"""Retrieval module for querying vector storage and returning relevant documents with metadata.""" import os from typing import List, Optional from dotenv import load_dotenv from langchain_core.callbacks import CallbackManagerForRetrieverRun from langchain_core.documents import Document from langchain_core.retrievers import BaseRetriever from loguru import logger from vector_storage import initialize_vector_store # Load environment variables load_dotenv() class VectorStoreRetriever(BaseRetriever): """ A custom retriever that uses the Qdrant vector store to retrieve relevant documents. """ vector_store: object # Qdrant vector store instance top_k: int = 5 # Number of documents to retrieve def _get_relevant_documents( self, query: str, *, run_manager: CallbackManagerForRetrieverRun ) -> List[Document]: """ Retrieve relevant documents based on the query. Args: query: The query string to search for run_manager: Callback manager for the run Returns: List of relevant documents with metadata """ logger.info(f"Searching for documents related to query: {query[:50]}...") try: # Perform similarity search on the vector store results = self.vector_store.similarity_search(query, k=self.top_k) logger.info(f"Found {len(results)} relevant documents") return results except Exception as e: logger.error(f"Error during similarity search: {str(e)}") return [] def create_retriever(collection_name: str = "documents_langchain", top_k: int = 5): """ Create and return a retriever instance connected to the vector store. Args: collection_name: Name of the Qdrant collection to use top_k: Number of documents to retrieve Returns: VectorStoreRetriever instance """ logger.info( f"Initializing vector store for retrieval from collection: {collection_name}" ) # Initialize the vector store vector_store = initialize_vector_store(collection_name=collection_name) # Create and return the retriever retriever = VectorStoreRetriever(vector_store=vector_store, top_k=top_k) return retriever def search_documents_with_metadata( query: str, collection_name: str = "documents_langchain", top_k: int = 5 ) -> List[dict]: """ Search for documents and return them with detailed metadata. Args: query: The query string to search for collection_name: Name of the Qdrant collection to use top_k: Number of documents to retrieve Returns: List of dictionaries containing document content and metadata """ logger.info(f"Starting document search with metadata for query: {query}") # Initialize the vector store vector_store = initialize_vector_store(collection_name=collection_name) try: # Standard similarity search documents = vector_store.similarity_search(query, k=top_k) # Format results to include content and metadata formatted_results = [] for doc in documents: formatted_result = { "content": doc.page_content, "metadata": doc.metadata, "source": doc.metadata.get("source", "Unknown"), "filename": doc.metadata.get("filename", "Unknown"), "page_number": doc.metadata.get( "page_number", doc.metadata.get("page", "N/A") ), "file_extension": doc.metadata.get("file_extension", "N/A"), "file_size": doc.metadata.get("file_size", "N/A"), } formatted_results.append(formatted_result) logger.info( f"Metadata search completed, returned {len(formatted_results)} documents" ) return formatted_results except Exception as e: logger.error(f"Error during document search with metadata: {str(e)}") return []