Files
rag-solution/services/rag/langchain/retrieval.py

158 lines
5.3 KiB
Python
Raw Normal View History

2026-02-03 23:25:24 +03:00
"""Retrieval module for querying vector storage and returning relevant documents with metadata."""
import os
from typing import List, Optional
2026-02-05 00:08:59 +03:00
from dotenv import load_dotenv
2026-02-03 23:25:24 +03:00
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from loguru import logger
from vector_storage import initialize_vector_store
2026-02-05 00:08:59 +03:00
# Load environment variables
load_dotenv()
2026-02-03 23:25:24 +03:00
class VectorStoreRetriever(BaseRetriever):
    """
    Retriever that answers a query by running a similarity search on a vector store.

    The wrapped ``vector_store`` only needs to expose a
    ``similarity_search(query, k=...)`` method.
    """

    vector_store: object  # Vector store instance (Qdrant, per the original note)
    top_k: int = 5  # Number of documents requested per search

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """
        Run a similarity search for ``query`` and return the matching documents.

        Args:
            query: The query string to search for
            run_manager: Callback manager for the run (not used by this method)

        Returns:
            The documents returned by the vector store, or an empty list when
            the search raised an exception (the error is logged, not re-raised)
        """
        # Only the first 50 characters of the query are written to the log.
        query_preview = query[:50]
        logger.info(f"Searching for documents related to query: {query_preview}...")

        try:
            matches = self.vector_store.similarity_search(query, k=self.top_k)
            logger.info(f"Found {len(matches)} relevant documents")
        except Exception as exc:
            # Best-effort: a failed search is reported and yields no documents.
            logger.error(f"Error during similarity search: {str(exc)}")
            matches = []

        return matches
def create_retriever(collection_name: str = "documents_langchain", top_k: int = 5):
    """
    Build a ``VectorStoreRetriever`` backed by the given collection.

    Args:
        collection_name: Name of the collection passed to ``initialize_vector_store``
        top_k: Number of documents the retriever should request per search

    Returns:
        A new ``VectorStoreRetriever`` wrapping a freshly initialized vector store
    """
    logger.info(f"Initializing vector store for retrieval from collection: {collection_name}")

    # The store is initialized first, then handed straight to the retriever.
    return VectorStoreRetriever(
        vector_store=initialize_vector_store(collection_name=collection_name),
        top_k=top_k,
    )
def search_documents(query: str, collection_name: str = "documents_langchain", top_k: int = 5) -> List[Document]:
    """
    Look up documents matching ``query`` through a newly created retriever.

    A retriever (and therefore a vector store) is created on every call.

    Args:
        query: The query string to search for
        collection_name: Name of the collection to search in
        top_k: Number of documents to retrieve

    Returns:
        Whatever the retriever's ``invoke`` call returns for the query
    """
    logger.info(f"Starting document search for query: {query}")

    # Build the retriever and run the query in one step.
    found = create_retriever(collection_name=collection_name, top_k=top_k).invoke(query)

    logger.info(f"Search completed, returned {len(found)} documents")
    return found
def search_documents_with_metadata(
    query: str,
    collection_name: str = "documents_langchain",
    top_k: int = 5
) -> List[dict]:
    """
    Run a similarity search and return each hit as a plain dictionary.

    The vector store is initialized outside the guarded section, so an
    initialization failure propagates to the caller. Errors raised by the
    search itself or while formatting the hits are logged and produce an
    empty list.

    Args:
        query: The query string to search for
        collection_name: Name of the collection to search in
        top_k: Number of documents to retrieve

    Returns:
        One dictionary per document with the keys ``content``, ``metadata``,
        ``source``, ``filename``, ``page_number``, ``file_extension`` and
        ``file_size``; an empty list if the search failed
    """
    logger.info(f"Starting document search with metadata for query: {query}")

    store = initialize_vector_store(collection_name=collection_name)

    try:
        hits = store.similarity_search(query, k=top_k)
        records = [_document_to_record(hit) for hit in hits]
        logger.info(f"Metadata search completed, returned {len(records)} documents")
    except Exception as exc:
        # Best-effort: report the failure and hand back no results.
        logger.error(f"Error during document search with metadata: {str(exc)}")
        records = []

    return records


def _document_to_record(doc) -> dict:
    """Flatten one search hit into a dict of content plus selected metadata fields."""
    content = doc.page_content
    meta = doc.metadata
    return {
        "content": content,
        "metadata": meta,
        "source": meta.get("source", "Unknown"),
        "filename": meta.get("filename", "Unknown"),
        # Prefer "page_number"; fall back to "page", then to "N/A".
        "page_number": meta.get("page_number", meta.get("page", "N/A")),
        "file_extension": meta.get("file_extension", "N/A"),
        "file_size": meta.get("file_size", "N/A"),
    }
def _run_example() -> None:
    """Run a sample metadata search and print a short summary of every hit."""
    sample_query = "What is the main topic discussed in the documents?"
    hits = search_documents_with_metadata(sample_query, top_k=5)

    print(f"Found {len(hits)} documents:")
    # Numbering starts at 1 for display purposes.
    for position, hit in enumerate(hits, start=1):
        print(f"\n{position}. Source: {hit['source']}")
        print(f" Filename: {hit['filename']}")
        print(f" Page: {hit['page_number']}")
        # Only the first 200 characters of the content are shown.
        print(f" Content preview: {hit['content'][:200]}...")
        print(f" Metadata: {hit['metadata']}")


if __name__ == "__main__":
    # Example usage
    _run_example()