removed test retrieval feature. off you go
This commit is contained in:
@@ -2,10 +2,11 @@
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from loguru import logger
|
||||
|
||||
from vector_storage import initialize_vector_store
|
||||
@@ -18,31 +19,31 @@ class VectorStoreRetriever(BaseRetriever):
|
||||
"""
|
||||
A custom retriever that uses the Qdrant vector store to retrieve relevant documents.
|
||||
"""
|
||||
|
||||
|
||||
vector_store: object # Qdrant vector store instance
|
||||
top_k: int = 5 # Number of documents to retrieve
|
||||
|
||||
|
||||
def _get_relevant_documents(
|
||||
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
||||
) -> List[Document]:
|
||||
"""
|
||||
Retrieve relevant documents based on the query.
|
||||
|
||||
|
||||
Args:
|
||||
query: The query string to search for
|
||||
run_manager: Callback manager for the run
|
||||
|
||||
|
||||
Returns:
|
||||
List of relevant documents with metadata
|
||||
"""
|
||||
logger.info(f"Searching for documents related to query: {query[:50]}...")
|
||||
|
||||
|
||||
try:
|
||||
# Perform similarity search on the vector store
|
||||
results = self.vector_store.similarity_search(query, k=self.top_k)
|
||||
|
||||
|
||||
logger.info(f"Found {len(results)} relevant documents")
|
||||
|
||||
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.error(f"Error during similarity search: {str(e)}")
|
||||
@@ -52,54 +53,29 @@ class VectorStoreRetriever(BaseRetriever):
|
||||
def create_retriever(
    collection_name: str = "documents_langchain", top_k: int = 5
) -> "VectorStoreRetriever":
    """
    Create and return a retriever instance connected to the vector store.

    Args:
        collection_name: Name of the Qdrant collection to use
        top_k: Number of documents to retrieve

    Returns:
        VectorStoreRetriever instance
    """
    # Emit the initialization message exactly once per call.
    logger.info(
        f"Initializing vector store for retrieval from collection: {collection_name}"
    )

    # Initialize the vector store
    vector_store = initialize_vector_store(collection_name=collection_name)

    # Create and return the retriever
    return VectorStoreRetriever(vector_store=vector_store, top_k=top_k)
|
||||
|
||||
|
||||
def search_documents(
    query: str,
    collection_name: str = "documents_langchain",
    top_k: int = 5,
) -> List[Document]:
    """
    Look up documents in the vector store that match the query.

    Args:
        query: The query string to search for
        collection_name: Name of the Qdrant collection to use
        top_k: Number of documents to retrieve

    Returns:
        The documents returned by the retriever's ``invoke`` call
    """
    logger.info(f"Starting document search for query: {query}")

    # Build a retriever for this call and run the query through it.
    documents = create_retriever(
        collection_name=collection_name, top_k=top_k
    ).invoke(query)

    logger.info(f"Search completed, returned {len(documents)} documents")
    return documents
|
||||
|
||||
|
||||
def search_documents_with_metadata(
|
||||
query: str,
|
||||
collection_name: str = "documents_langchain",
|
||||
top_k: int = 5
|
||||
query: str, collection_name: str = "documents_langchain", top_k: int = 5
|
||||
) -> List[dict]:
|
||||
"""
|
||||
Search for documents and return them with detailed metadata.
|
||||
@@ -129,30 +105,20 @@ def search_documents_with_metadata(
|
||||
"metadata": doc.metadata,
|
||||
"source": doc.metadata.get("source", "Unknown"),
|
||||
"filename": doc.metadata.get("filename", "Unknown"),
|
||||
"page_number": doc.metadata.get("page_number", doc.metadata.get("page", "N/A")),
|
||||
"page_number": doc.metadata.get(
|
||||
"page_number", doc.metadata.get("page", "N/A")
|
||||
),
|
||||
"file_extension": doc.metadata.get("file_extension", "N/A"),
|
||||
"file_size": doc.metadata.get("file_size", "N/A")
|
||||
"file_size": doc.metadata.get("file_size", "N/A"),
|
||||
}
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
logger.info(f"Metadata search completed, returned {len(formatted_results)} documents")
|
||||
logger.info(
|
||||
f"Metadata search completed, returned {len(formatted_results)} documents"
|
||||
)
|
||||
|
||||
return formatted_results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during document search with metadata: {str(e)}")
|
||||
return []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Example usage: run one sample query and print a summary of every hit.
    sample_query = "What is the main topic discussed in the documents?"
    hits = search_documents_with_metadata(sample_query, top_k=5)

    print(f"Found {len(hits)} documents:")
    for rank, hit in enumerate(hits, start=1):
        print(f"\n{rank}. Source: {hit['source']}")
        print(f" Filename: {hit['filename']}")
        print(f" Page: {hit['page_number']}")
        # Only the first 200 characters of the content are shown.
        print(f" Content preview: {hit['content'][:200]}...")
        print(f" Metadata: {hit['metadata']}")
|
||||
Reference in New Issue
Block a user