LangChain: load documents into vector storage

2026-02-03 20:52:08 +03:00
parent 762ed89843
commit 8d7e39a603
5 changed files with 299 additions and 42 deletions
--- a/services/rag/langchain/vector_storage.py
+++ b/services/rag/langchain/vector_storage.py
@@ -2,11 +2,12 @@

 import os
 from typing import Optional
-from langchain_community.vectorstores import Qdrant
-from langchain_ollama import OllamaEmbeddings
-from langchain_core.documents import Document
-from qdrant_client import QdrantClient
+
 from dotenv import load_dotenv
+from langchain_community.vectorstores import Qdrant
+from langchain_core.documents import Document
+from langchain_ollama import OllamaEmbeddings
+from qdrant_client import QdrantClient

 # Load environment variables
 load_dotenv()
@@ -21,16 +22,15 @@ OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")


 def initialize_vector_store(
-    collection_name: str = "documents",
-    recreate_collection: bool = False
+    collection_name: str = "documents_langchain", recreate_collection: bool = False
 ) -> Qdrant:
    """
    Initialize and return a Qdrant vector store with Ollama embeddings.
-    
+
    Args:
        collection_name: Name of the Qdrant collection to use
        recreate_collection: Whether to recreate the collection if it exists
-    
+
    Returns:
        Initialized Qdrant vector store
    """
@@ -39,44 +39,44 @@ def initialize_vector_store(
        host=QDRANT_HOST,
        port=QDRANT_REST_PORT,
    )
-    
+
    # Initialize Ollama embeddings
    embeddings = OllamaEmbeddings(
        model=OLLAMA_EMBEDDING_MODEL,
-        base_url="http://localhost:11434"  # Default Ollama URL
+        base_url="http://localhost:11434",  # Default Ollama URL
    )
-    
+
    # Create or get the vector store
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )
-    
+
    # If recreate_collection is True, we'll delete and recreate the collection
-    if recreate_collection and collection_name in [col.name for col in client.get_collections().collections]:
+    if recreate_collection and collection_name in [
+        col.name for col in client.get_collections().collections
+    ]:
        client.delete_collection(collection_name)
-        
+
        # Recreate with proper configuration
        vector_store = Qdrant.from_documents(
            documents=[],
            embedding=embeddings,
            url=f"http://{QDRANT_HOST}:{QDRANT_REST_PORT}",
            collection_name=collection_name,
-            force_recreate=True
+            force_recreate=True,
        )
-    
+
    return vector_store


 def add_documents_to_vector_store(
-    vector_store: Qdrant,
-    documents: list[Document],
-    batch_size: int = 10
+    vector_store: Qdrant, documents: list[Document], batch_size: int = 10
 ) -> None:
    """
    Add documents to the vector store.
-    
+
    Args:
        vector_store: Initialized Qdrant vector store
        documents: List of documents to add
@@ -84,23 +84,19 @@ def add_documents_to_vector_store(
    """
    # Add documents to the vector store in batches
    for i in range(0, len(documents), batch_size):
-        batch = documents[i:i + batch_size]
+        batch = documents[i : i + batch_size]
        vector_store.add_documents(batch)


-def search_vector_store(
-    vector_store: Qdrant,
-    query: str,
-    top_k: int = 5
-) -> list:
+def search_vector_store(vector_store: Qdrant, query: str, top_k: int = 5) -> list:
    """
    Search the vector store for similar documents.
-    
+
    Args:
        vector_store: Initialized Qdrant vector store
        query: Query string to search for
        top_k: Number of top results to return
-    
+
    Returns:
        List of similar documents
    """
@@ -127,27 +123,29 @@ def initialize_vector_store_with_openrouter(
        host=QDRANT_HOST,
        port=QDRANT_REST_PORT,
    )
-    
+
    # Initialize OpenAI embeddings via OpenRouter
    embeddings = OpenAIEmbeddings(
        model=OPENROUTER_EMBEDDING_MODEL,
        openai_api_key=OPENROUTER_API_KEY,
        openai_api_base="https://openrouter.ai/api/v1"
    )
-    
+
    # Create or get the vector store
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )
-    
+
    return vector_store
 """


 if __name__ == "__main__":
    # Example usage
-    print(f"Initializing vector store with Ollama embedding model: {OLLAMA_EMBEDDING_MODEL}")
+    print(
+        f"Initializing vector store with Ollama embedding model: {OLLAMA_EMBEDDING_MODEL}"
+    )
    vector_store = initialize_vector_store()
-    print("Vector store initialized successfully!")
+    print("Vector store initialized successfully!")