langchain loading documents into vector storage

2026-02-03 20:52:08 +03:00
parent 762ed89843
commit 8d7e39a603
5 changed files with 299 additions and 42 deletions

View File

@@ -26,9 +26,9 @@ Chosen data folder: relative ./../../../data - from the current folder
# Phase 4 (creating module for loading documents from the folder)
- [ ] Create file `enrichment.py` with a function that loads data from the data folder into the chosen vector storage, using the data loaders configured for each extension. Remember to specify default embedding metadata properties, such as filename, paragraph, page, and section, wherever possible (documents can have pages, sections, paragraphs, etc.). Use the text splitters of the chosen RAG framework according to the documents being loaded. Which chunking/text-splitting strategies the framework offers can be learned online.
- [ ] Use a built-in strategy for marking which documents have been loaded (if such a mechanism exists) and which have not, to avoid re-reading and re-enriching the vector storage with existing data. If there is no built-in mechanism of this type, install a sqlite library and use a local sqlite database file to store this information.
- [ ] Add activation of this function in the CLI entrypoint, as a command.
- [x] Create file `enrichment.py` with a function that loads data from the data folder into the chosen vector storage, using the data loaders configured for each extension. Remember to specify default embedding metadata properties, such as filename, paragraph, page, and section, wherever possible (documents can have pages, sections, paragraphs, etc.). Use the text splitters of the chosen RAG framework according to the documents being loaded. Which chunking/text-splitting strategies the framework offers can be learned online.
- [x] Use a built-in strategy for marking which documents have been loaded (if such a mechanism exists) and which have not, to avoid re-reading and re-enriching the vector storage with existing data. If there is no built-in mechanism of this type, install a sqlite library and use a local sqlite database file to store this information.
- [x] Add activation of this function in the CLI entrypoint, as a command.
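For reference, the first item above asks for default embedding metadata such as filename, page, and section. A minimal sketch of what one enriched chunk carries (the values here are illustrative, not taken from the repository):

from langchain_core.documents import Document

# Hypothetical chunk with the default metadata properties named above
chunk = Document(
    page_content="Quarterly revenue grew by 12 percent...",
    metadata={
        "filename": "report.pdf",  # always derivable from the source path
        "page": 3,                 # only for formats that have pages
        "section": "Results",      # only for formats that have sections
    },
)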
# Phase 5 (preparation for the retrieval feature)

View File

@@ -24,6 +24,7 @@ rag-solution/services/rag/langchain/
├── app.py # Main application file (currently empty)
├── cli.py # CLI entrypoint with click library
├── EXTENSIONS.md # Supported file extensions and LangChain loaders
├── enrichment.py # Document enrichment module for loading documents to vector storage
├── PLANNING.md # Development roadmap and phases
├── QWEN.md # Current file - project context
├── requirements.txt # Python dependencies
@@ -64,10 +65,10 @@ The project is organized into 6 development phases as outlined in `PLANNING.md`:
- [x] Prepare OpenAI fallback (commented)
### Phase 4: Document Loading Module
- [ ] Create `enrichment.py` for loading documents to vector storage
- [ ] Implement text splitting strategies
- [ ] Add document tracking to prevent re-processing
- [ ] Integrate with CLI
- [x] Create `enrichment.py` for loading documents to vector storage
- [x] Implement text splitting strategies
- [x] Add document tracking to prevent re-processing
- [x] Integrate with CLI
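The text-splitting item above is covered by LangChain's RecursiveCharacterTextSplitter; a standalone sketch with the same chunk parameters used in enrichment.py:

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text("A long document body. " * 200)
# Each chunk is at most 1000 characters and overlaps its neighbor by up to 200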
### Phase 5: Retrieval Feature
- [ ] Create `retrieval.py` for querying vector storage

View File

@@ -28,5 +28,31 @@ def ping():
click.echo("pong")
@cli.command(name="enrich", help="Load documents from data directory and store in vector database")
@click.option('--data-dir', default="../../../data", help="Path to the data directory")
@click.option('--collection-name', default="documents", help="Name of the vector store collection")
def enrich(data_dir, collection_name):
"""Load documents from data directory and store in vector database"""
logger.info(f"Starting enrichment process for directory: {data_dir}")
try:
# Import here to avoid circular dependencies
from vector_storage import initialize_vector_store
from enrichment import run_enrichment_process
# Initialize vector store
vector_store = initialize_vector_store(collection_name=collection_name)
# Run enrichment process
run_enrichment_process(vector_store, data_dir=data_dir)
logger.info("Enrichment process completed successfully!")
click.echo("Documents have been successfully loaded into the vector store.")
except Exception as e:
logger.error(f"Error during enrichment process: {str(e)}")
click.echo(f"Error: {str(e)}")
if __name__ == "__main__":
cli()
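A quick way to exercise the new command without a shell is click's built-in test runner; a minimal sketch, assuming cli.py is importable from the current working directory:

from click.testing import CliRunner

from cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["enrich", "--data-dir", "../../../data"])
print(result.output)  # echoes the success or error message from the command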

View File

@@ -0,0 +1,232 @@
"""Document enrichment module for loading documents into vector storage."""
import os
import hashlib
from pathlib import Path
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyPDFLoader,
UnstructuredWordDocumentLoader,
UnstructuredPowerPointLoader,
PandasExcelLoader,
UnstructuredImageLoader,
UnstructuredODTLoader,
)
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from loguru import logger
import sqlite3
# Define the path to the data directory
DATA_DIR = Path("../../../data").resolve()
DB_PATH = Path("document_tracking.db").resolve()
Base = declarative_base()
class ProcessedDocument(Base):
    """Database model for tracking processed documents."""

    __tablename__ = "processed_documents"

    id = Column(Integer, primary_key=True)
    file_path = Column(String, unique=True, nullable=False)
    file_hash = Column(String, nullable=False)
class DocumentEnricher:
    """Class responsible for enriching documents and loading them to vector storage."""

    def __init__(self, vector_store):
        self.vector_store = vector_store
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        # Initialize database for tracking processed documents
        self._init_db()

    def _init_db(self):
        """Initialize the SQLite database for tracking processed documents."""
        self.engine = create_engine(f"sqlite:///{DB_PATH}")
        Base.metadata.create_all(self.engine)
        Session = sessionmaker(bind=self.engine)
        self.session = Session()
    def _get_file_hash(self, file_path: str) -> str:
        """Calculate the SHA-256 hash of a file."""
        hash_sha256 = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Read the file in chunks to handle large files
            for chunk in iter(lambda: f.read(4096), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()

    def _is_document_processed(self, file_path: str) -> bool:
        """Check if a document has already been processed."""
        file_hash = self._get_file_hash(file_path)
        existing = self.session.query(ProcessedDocument).filter_by(
            file_hash=file_hash
        ).first()
        return existing is not None

    def _mark_document_processed(self, file_path: str):
        """Mark a document as processed in the database."""
        file_hash = self._get_file_hash(file_path)
        doc_record = ProcessedDocument(
            file_path=file_path,
            file_hash=file_hash,
        )
        self.session.add(doc_record)
        self.session.commit()
    def _get_loader_for_extension(self, file_path: str):
        """Get the appropriate loader for a given file extension, or None if unsupported."""
        ext = Path(file_path).suffix.lower()
        if ext == ".pdf":
            return PyPDFLoader(file_path)
        elif ext in [".docx", ".doc"]:
            return UnstructuredWordDocumentLoader(file_path)
        elif ext == ".pptx":
            return UnstructuredPowerPointLoader(file_path)
        elif ext in [".xlsx", ".xls"]:
            return UnstructuredExcelLoader(file_path)
        elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
            return UnstructuredImageLoader(file_path)
        elif ext == ".odt":
            return UnstructuredODTLoader(file_path)
        else:
            # For text files and unknown formats, probe whether the file is
            # valid UTF-8 text and, if so, fall back to the plain text loader
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    f.read()
                return TextLoader(file_path, encoding="utf-8")
            except UnicodeDecodeError:
                logger.warning(f"Could not decode file as text: {file_path}")
                return None
    def load_and_split_documents(self, file_paths: List[str]) -> List[Document]:
        """Load documents from file paths and split them appropriately."""
        all_docs = []
        for file_path in file_paths:
            if self._is_document_processed(file_path):
                logger.info(f"Skipping already processed document: {file_path}")
                continue

            logger.info(f"Processing document: {file_path}")

            # Get the appropriate loader for the file extension
            loader = self._get_loader_for_extension(file_path)
            if loader is None:
                # Unsupported format that could not be read as text
                continue

            try:
                # Load the document(s)
                docs = loader.load()

                # Add metadata to each document
                for doc in docs:
                    # Record provenance of the original file
                    doc.metadata["source"] = file_path
                    doc.metadata["filename"] = Path(file_path).name
                    doc.metadata["file_path"] = file_path
                    doc.metadata["file_size"] = os.path.getsize(file_path)

                    # Add page number if available in the loader's metadata
                    if "page" in doc.metadata:
                        doc.metadata["page_number"] = doc.metadata["page"]

                    # Add file extension as metadata
                    doc.metadata["file_extension"] = Path(file_path).suffix

                # Split documents if they are too large
                split_docs = self.text_splitter.split_documents(docs)

                # Add to the collection
                all_docs.extend(split_docs)

                # Mark document as processed
                self._mark_document_processed(file_path)
            except Exception as e:
                logger.error(f"Error processing {file_path}: {str(e)}")
                continue

        return all_docs
    def enrich_and_store(self, file_paths: List[str]):
        """Load, enrich, and store documents in the vector store."""
        logger.info(f"Starting enrichment process for {len(file_paths)} files...")

        # Load and split documents
        documents = self.load_and_split_documents(file_paths)
        if not documents:
            logger.info("No new documents to process.")
            return

        logger.info(
            f"Loaded and split {len(documents)} document chunks, adding to vector store..."
        )

        # Add documents to vector store
        self.vector_store.add_documents(documents)
        logger.info(f"Successfully added {len(documents)} document chunks to vector store.")
def get_all_documents_from_data_dir(data_dir: str = str(DATA_DIR)) -> List[str]:
    """Get all supported document file paths from the data directory."""
    supported_extensions = {
        ".pdf", ".docx", ".doc", ".pptx", ".xlsx", ".xls",
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",
        ".webp", ".odt",
        # Plain-text formats, routed through the TextLoader fallback
        ".txt", ".md",
    }
    file_paths = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if Path(file).suffix.lower() in supported_extensions:
                file_paths.append(os.path.join(root, file))
    return file_paths
def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
    """Run the full enrichment process."""
    logger.info(f"Starting document enrichment from directory: {data_dir}")

    # Get all supported documents from the data directory
    file_paths = get_all_documents_from_data_dir(data_dir)
    if not file_paths:
        logger.warning(f"No supported documents found in {data_dir}")
        return

    logger.info(f"Found {len(file_paths)} documents to process")

    # Initialize the document enricher and run the enrichment process
    enricher = DocumentEnricher(vector_store)
    enricher.enrich_and_store(file_paths)

    logger.info("Document enrichment process completed!")


if __name__ == "__main__":
    # Example usage
    from vector_storage import initialize_vector_store

    vector_store = initialize_vector_store()
    run_enrichment_process(vector_store)
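After a run, the dedup state can be checked by querying the tracking database directly; a minimal sketch using the standard library, assuming the default document_tracking.db file next to the module (table and column names come from the ProcessedDocument model above):

import sqlite3

conn = sqlite3.connect("document_tracking.db")
for file_path, file_hash in conn.execute(
    "SELECT file_path, file_hash FROM processed_documents"
):
    print(f"{file_hash[:12]}  {file_path}")  # short hash prefix plus source path
conn.close()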

View File

@@ -2,11 +2,12 @@
import os
from typing import Optional
from langchain_community.vectorstores import Qdrant
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from dotenv import load_dotenv
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from qdrant_client import QdrantClient
# Load environment variables
load_dotenv()
@@ -21,8 +22,7 @@ OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
def initialize_vector_store(
collection_name: str = "documents",
recreate_collection: bool = False
collection_name: str = "documents_langchain", recreate_collection: bool = False
) -> Qdrant:
"""
Initialize and return a Qdrant vector store with Ollama embeddings.
@@ -43,7 +43,7 @@ def initialize_vector_store(
# Initialize Ollama embeddings
embeddings = OllamaEmbeddings(
model=OLLAMA_EMBEDDING_MODEL,
base_url="http://localhost:11434" # Default Ollama URL
base_url="http://localhost:11434", # Default Ollama URL
)
# Create or get the vector store
@@ -54,7 +54,9 @@ def initialize_vector_store(
)
# If recreate_collection is True, we'll delete and recreate the collection
if recreate_collection and collection_name in [col.name for col in client.get_collections().collections]:
if recreate_collection and collection_name in [
col.name for col in client.get_collections().collections
]:
client.delete_collection(collection_name)
# Recreate with proper configuration
@@ -63,16 +65,14 @@ def initialize_vector_store(
embedding=embeddings,
url=f"http://{QDRANT_HOST}:{QDRANT_REST_PORT}",
collection_name=collection_name,
force_recreate=True
force_recreate=True,
)
return vector_store
def add_documents_to_vector_store(
vector_store: Qdrant,
documents: list[Document],
batch_size: int = 10
vector_store: Qdrant, documents: list[Document], batch_size: int = 10
) -> None:
"""
Add documents to the vector store.
@@ -88,11 +88,7 @@ def add_documents_to_vector_store(
vector_store.add_documents(batch)
def search_vector_store(
vector_store: Qdrant,
query: str,
top_k: int = 5
) -> list:
def search_vector_store(vector_store: Qdrant, query: str, top_k: int = 5) -> list:
"""
Search the vector store for similar documents.
@@ -148,6 +144,8 @@ def initialize_vector_store_with_openrouter(
if __name__ == "__main__":
# Example usage
print(f"Initializing vector store with Ollama embedding model: {OLLAMA_EMBEDDING_MODEL}")
print(
f"Initializing vector store with Ollama embedding model: {OLLAMA_EMBEDDING_MODEL}"
)
vector_store = initialize_vector_store()
print("Vector store initialized successfully!")