rag-solution/services/rag/langchain/retrieval.py

"""Retrieval module for querying vector storage and returning relevant documents with metadata."""

import os
from typing import List, Optional
from dotenv import load_dotenv
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from loguru import logger

from vector_storage import initialize_vector_store

# Load environment variables
load_dotenv()


class VectorStoreRetriever(BaseRetriever):
    """
    Retriever that answers a query by running a similarity search on a vector store.

    The store is described in this module as a Qdrant vector store; the only
    thing this class requires of it is a ``similarity_search(query, k=...)`` method.
    """

    # Store that is searched; declared as a plain ``object`` so any store type is accepted.
    vector_store: object
    # How many documents are requested from the store per query.
    top_k: int = 5

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """
        Run a similarity search for the query and hand back the matches.

        Args:
            query: The query string to search for
            run_manager: Callback manager for the run (not used by this implementation)

        Returns:
            The documents returned by the vector store, or an empty list when the
            search raised an exception (the error is logged, not re-raised)
        """
        # Only the first 50 characters of the query are written to the log.
        logger.info(f"Searching for documents related to query: {query[:50]}...")

        try:
            results = self.vector_store.similarity_search(query, k=self.top_k)
            logger.info(f"Found {len(results)} relevant documents")
        except Exception as e:
            # Best-effort lookup: a failed search yields "no documents" instead of an error.
            logger.error(f"Error during similarity search: {str(e)}")
            results = []

        return results


def create_retriever(collection_name: str = "documents_langchain", top_k: int = 5):
    """
    Build a retriever whose vector store is initialized for the given collection.

    Args:
        collection_name: Collection name passed on to ``initialize_vector_store``
        top_k: Number of documents the retriever requests per query

    Returns:
        A new VectorStoreRetriever wrapping the freshly initialized vector store
    """
    logger.info(f"Initializing vector store for retrieval from collection: {collection_name}")

    # The store is initialized first (argument evaluation), then wrapped by the retriever.
    return VectorStoreRetriever(
        vector_store=initialize_vector_store(collection_name=collection_name),
        top_k=top_k,
    )


def search_documents(query: str, collection_name: str = "documents_langchain", top_k: int = 5) -> List[Document]:
    """
    Look up documents for a query through a newly created retriever.

    A retriever is built on every call via ``create_retriever`` and queried
    through its ``invoke`` method.

    Args:
        query: The query string to search for
        collection_name: Collection name forwarded to ``create_retriever``
        top_k: Number of documents to retrieve

    Returns:
        The result of invoking the retriever with the query
    """
    logger.info(f"Starting document search for query: {query}")

    # Build the retriever and run the query in one step.
    results = create_retriever(collection_name=collection_name, top_k=top_k).invoke(query)

    logger.info(f"Search completed, returned {len(results)} documents")
    return results


def search_documents_with_metadata(
    query: str,
    collection_name: str = "documents_langchain",
    top_k: int = 5
) -> List[dict]:
    """
    Run a similarity search and flatten each hit into a plain dictionary.

    Each dictionary carries the document text under "content", the full
    metadata mapping under "metadata", and a few metadata fields lifted to
    the top level with placeholder defaults when they are absent.

    Args:
        query: The query string to search for
        collection_name: Collection name passed on to ``initialize_vector_store``
        top_k: Number of documents to retrieve

    Returns:
        One dictionary per document found, or an empty list when the search
        or the formatting raised an exception (the error is logged)
    """
    logger.info(f"Starting document search with metadata for query: {query}")

    # Initialization sits outside the try block, so its errors reach the caller.
    vector_store = initialize_vector_store(collection_name=collection_name)

    try:
        formatted_results = [
            {
                "content": doc.page_content,
                "metadata": doc.metadata,
                "source": doc.metadata.get("source", "Unknown"),
                "filename": doc.metadata.get("filename", "Unknown"),
                # "page_number" wins; "page" is the fallback key before giving up with "N/A".
                "page_number": doc.metadata.get("page_number", doc.metadata.get("page", "N/A")),
                "file_extension": doc.metadata.get("file_extension", "N/A"),
                "file_size": doc.metadata.get("file_size", "N/A")
            }
            for doc in vector_store.similarity_search(query, k=top_k)
        ]

        logger.info(f"Metadata search completed, returned {len(formatted_results)} documents")

        return formatted_results

    except Exception as e:
        # Best-effort: search/formatting failures are logged and reported as "no results".
        logger.error(f"Error during document search with metadata: {str(e)}")
        return []


if __name__ == "__main__":
    # Demo run: issue one sample query and print a short summary of every hit.
    query = "What is the main topic discussed in the documents?"
    results = search_documents_with_metadata(query=query, top_k=5)

    print(f"Found {len(results)} documents:")
    # Hits are numbered from 1 for display.
    for i, result in enumerate(results, start=1):
        print(f"\n{i}. Source: {result['source']}")
        print(f"   Filename: {result['filename']}")
        print(f"   Page: {result['page_number']}")
        # Preview is limited to the first 200 characters of the content.
        print(f"   Content preview: {result['content'][:200]}...")
        print(f"   Metadata: {result['metadata']}")
Working retrieval with the cli 2026-02-03 23:25:24 +03:00			`"""Retrieval module for querying vector storage and returning relevant documents with metadata."""`

			`import os`
			`from typing import List, Optional`
properly loading .env file with dotenv 2026-02-05 00:08:59 +03:00			`from dotenv import load_dotenv`
Working retrieval with the cli 2026-02-03 23:25:24 +03:00			`from langchain_core.retrievers import BaseRetriever`
			`from langchain_core.callbacks import CallbackManagerForRetrieverRun`
			`from langchain_core.documents import Document`
			`from loguru import logger`

			`from vector_storage import initialize_vector_store`

properly loading .env file with dotenv 2026-02-05 00:08:59 +03:00			`# Load environment variables`
			`load_dotenv()`

Working retrieval with the cli 2026-02-03 23:25:24 +03:00
			`class VectorStoreRetriever(BaseRetriever):`
			`"""`
			`A custom retriever that uses the Qdrant vector store to retrieve relevant documents.`
			`"""`

			`vector_store: object # Qdrant vector store instance`
			`top_k: int = 5 # Number of documents to retrieve`

			`def _get_relevant_documents(`
			`self, query: str, *, run_manager: CallbackManagerForRetrieverRun`
			`) -> List[Document]:`
			`"""`
			`Retrieve relevant documents based on the query.`

			`Args:`
			`query: The query string to search for`
			`run_manager: Callback manager for the run`

			`Returns:`
			`List of relevant documents with metadata`
			`"""`
			`logger.info(f"Searching for documents related to query: {query[:50]}...")`

			`try:`
			`# Perform similarity search on the vector store`
			`results = self.vector_store.similarity_search(query, k=self.top_k)`

			`logger.info(f"Found {len(results)} relevant documents")`

			`return results`
			`except Exception as e:`
			`logger.error(f"Error during similarity search: {str(e)}")`
			`return []`


			`def create_retriever(collection_name: str = "documents_langchain", top_k: int = 5):`
			`"""`
			`Create and return a retriever instance connected to the vector store.`

			`Args:`
			`collection_name: Name of the Qdrant collection to use`
			`top_k: Number of documents to retrieve`

			`Returns:`
			`VectorStoreRetriever instance`
			`"""`
			`logger.info(f"Initializing vector store for retrieval from collection: {collection_name}")`

			`# Initialize the vector store`
			`vector_store = initialize_vector_store(collection_name=collection_name)`

			`# Create and return the retriever`
			`retriever = VectorStoreRetriever(vector_store=vector_store, top_k=top_k)`

			`return retriever`


			`def search_documents(query: str, collection_name: str = "documents_langchain", top_k: int = 5) -> List[Document]:`
			`"""`
			`Search for documents in the vector store based on the query.`

			`Args:`
			`query: The query string to search for`
			`collection_name: Name of the Qdrant collection to use`
			`top_k: Number of documents to retrieve`

			`Returns:`
			`List of documents with metadata`
			`"""`
			`logger.info(f"Starting document search for query: {query}")`

			`# Create the retriever`
			`retriever = create_retriever(collection_name=collection_name, top_k=top_k)`

			`# Perform the search`
			`results = retriever.invoke(query)`

			`logger.info(f"Search completed, returned {len(results)} documents")`

			`return results`


			`def search_documents_with_metadata(`
			`query: str,`
			`collection_name: str = "documents_langchain",`
			`top_k: int = 5`
			`) -> List[dict]:`
			`"""`
			`Search for documents and return them with detailed metadata.`

			`Args:`
			`query: The query string to search for`
			`collection_name: Name of the Qdrant collection to use`
			`top_k: Number of documents to retrieve`

			`Returns:`
			`List of dictionaries containing document content and metadata`
			`"""`
			`logger.info(f"Starting document search with metadata for query: {query}")`

			`# Initialize the vector store`
			`vector_store = initialize_vector_store(collection_name=collection_name)`

			`try:`
			`# Standard similarity search`
			`documents = vector_store.similarity_search(query, k=top_k)`

			`# Format results to include content and metadata`
			`formatted_results = []`
			`for doc in documents:`
			`formatted_result = {`
			`"content": doc.page_content,`
			`"metadata": doc.metadata,`
			`"source": doc.metadata.get("source", "Unknown"),`
			`"filename": doc.metadata.get("filename", "Unknown"),`
			`"page_number": doc.metadata.get("page_number", doc.metadata.get("page", "N/A")),`
			`"file_extension": doc.metadata.get("file_extension", "N/A"),`
			`"file_size": doc.metadata.get("file_size", "N/A")`
			`}`
			`formatted_results.append(formatted_result)`

			`logger.info(f"Metadata search completed, returned {len(formatted_results)} documents")`

			`return formatted_results`

			`except Exception as e:`
			`logger.error(f"Error during document search with metadata: {str(e)}")`
			`return []`


			`if __name__ == "__main__":`
			`# Example usage`
			`query = "What is the main topic discussed in the documents?"`
			`results = search_documents_with_metadata(query, top_k=5)`

			`print(f"Found {len(results)} documents:")`
			`for i, result in enumerate(results, 1):`
			`print(f"\n{i}. Source: {result['source']}")`
			`print(f" Filename: {result['filename']}")`
			`print(f" Page: {result['page_number']}")`
			`print(f" Content preview: {result['content'][:200]}...")`
			`print(f" Metadata: {result['metadata']}")`