langchain loading documents into vector storage

This commit is contained in:
2026-02-03 20:52:08 +03:00
parent 762ed89843
commit 8d7e39a603
5 changed files with 299 additions and 42 deletions

View File

@@ -2,11 +2,12 @@
import os
from typing import Optional
from langchain_community.vectorstores import Qdrant
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from dotenv import load_dotenv
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from qdrant_client import QdrantClient
# Load environment variables
load_dotenv()
@@ -21,16 +22,15 @@ OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
def initialize_vector_store(
collection_name: str = "documents",
recreate_collection: bool = False
collection_name: str = "documents_langchain", recreate_collection: bool = False
) -> Qdrant:
"""
Initialize and return a Qdrant vector store with Ollama embeddings.
Args:
collection_name: Name of the Qdrant collection to use
recreate_collection: Whether to recreate the collection if it exists
Returns:
Initialized Qdrant vector store
"""
@@ -39,44 +39,44 @@ def initialize_vector_store(
host=QDRANT_HOST,
port=QDRANT_REST_PORT,
)
# Initialize Ollama embeddings
embeddings = OllamaEmbeddings(
model=OLLAMA_EMBEDDING_MODEL,
base_url="http://localhost:11434" # Default Ollama URL
base_url="http://localhost:11434", # Default Ollama URL
)
# Create or get the vector store
vector_store = Qdrant(
client=client,
collection_name=collection_name,
embeddings=embeddings,
)
# If recreate_collection is True, we'll delete and recreate the collection
if recreate_collection and collection_name in [col.name for col in client.get_collections().collections]:
if recreate_collection and collection_name in [
col.name for col in client.get_collections().collections
]:
client.delete_collection(collection_name)
# Recreate with proper configuration
vector_store = Qdrant.from_documents(
documents=[],
embedding=embeddings,
url=f"http://{QDRANT_HOST}:{QDRANT_REST_PORT}",
collection_name=collection_name,
force_recreate=True
force_recreate=True,
)
return vector_store
def add_documents_to_vector_store(
    vector_store: Qdrant, documents: list[Document], batch_size: int = 10
) -> None:
    """
    Add documents to the vector store in fixed-size batches.

    Args:
        vector_store: Initialized Qdrant vector store
        documents: List of documents to add
        batch_size: Maximum number of documents passed to each
            ``add_documents`` call; must be at least 1

    Raises:
        ValueError: If ``batch_size`` is smaller than 1
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step yields no batches at all, which would silently drop every document.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Add documents to the vector store in batches
    for start in range(0, len(documents), batch_size):
        vector_store.add_documents(documents[start : start + batch_size])
def search_vector_store(
vector_store: Qdrant,
query: str,
top_k: int = 5
) -> list:
def search_vector_store(vector_store: Qdrant, query: str, top_k: int = 5) -> list:
"""
Search the vector store for similar documents.
Args:
vector_store: Initialized Qdrant vector store
query: Query string to search for
top_k: Number of top results to return
Returns:
List of similar documents
"""
@@ -127,27 +123,29 @@ def initialize_vector_store_with_openrouter(
host=QDRANT_HOST,
port=QDRANT_REST_PORT,
)
# Initialize OpenAI embeddings via OpenRouter
embeddings = OpenAIEmbeddings(
model=OPENROUTER_EMBEDDING_MODEL,
openai_api_key=OPENROUTER_API_KEY,
openai_api_base="https://openrouter.ai/api/v1"
)
# Create or get the vector store
vector_store = Qdrant(
client=client,
collection_name=collection_name,
embeddings=embeddings,
)
return vector_store
"""
if __name__ == "__main__":
    # Example usage: build the vector store with default arguments and
    # report progress on stdout.
    print(
        f"Initializing vector store with Ollama embedding model: {OLLAMA_EMBEDDING_MODEL}"
    )
    vector_store = initialize_vector_store()
    print("Vector store initialized successfully!")