enrichment with years, events

2026-02-10 13:20:19 +03:00
parent ce62fd50ed
commit 447ecaba39
5 changed files with 267 additions and 7 deletions


@@ -1,14 +1,16 @@
"""Retrieval module for querying vector storage and returning relevant documents with metadata."""
import os
from typing import List, Optional
from typing import List
from dotenv import load_dotenv
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from loguru import logger
from qdrant_client.http.models import FieldCondition, Filter, MatchAny
from helpers import extract_russian_event_names, extract_years_from_text
from vector_storage import initialize_vector_store
# Load environment variables
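
The two helpers imported above come from a module that is not shown in this hunk. Below is a minimal sketch of what they could look like, inferred from how the retriever uses them later in the diff; the function names come from the import, while the regex and the event list are assumptions.

import re
from typing import List

RUSSIAN_EVENT_NAMES = [
    "курская битва",
    "сталинградская битва",
    "блокада ленинграда",
]  # illustrative list only

def extract_years_from_text(text: str) -> List[int]:
    """Return four-digit years (1000-2099) found in free text."""
    return [int(year) for year in re.findall(r"\b(?:1\d{3}|20\d{2})\b", text)]

def extract_russian_event_names(text: str) -> List[str]:
    """Return lowercased known event names mentioned in the text."""
    lowered = text.lower()
    return [event for event in RUSSIAN_EVENT_NAMES if event in lowered]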
@@ -23,6 +25,91 @@ class VectorStoreRetriever(BaseRetriever):
    vector_store: object  # Qdrant vector store instance
    top_k: int = 5  # Number of documents to retrieve

    def _build_qdrant_filter(
        self, years: List[int], events: List[str]
    ) -> Filter | None:
        """Build a Qdrant payload filter for extracted years and events."""
        conditions: List[FieldCondition] = []
        if years:
            conditions.extend(
                [
                    FieldCondition(
                        key="metadata.years",
                        match=MatchAny(any=years),
                    ),
                    FieldCondition(
                        key="years",
                        match=MatchAny(any=years),
                    ),
                ]
            )
        if events:
            conditions.extend(
                [
                    FieldCondition(
                        key="metadata.events",
                        match=MatchAny(any=events),
                    ),
                    FieldCondition(
                        key="events",
                        match=MatchAny(any=events),
                    ),
                ]
            )
        if not conditions:
            return None
        return Filter(should=conditions)
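    # Illustration (hypothetical values): a query mentioning 1943 and "курская битва"
    # would yield roughly
    #   Filter(should=[
    #       FieldCondition(key="metadata.years", match=MatchAny(any=[1943])),
    #       FieldCondition(key="years", match=MatchAny(any=[1943])),
    #       FieldCondition(key="metadata.events", match=MatchAny(any=["курская битва"])),
    #       FieldCondition(key="events", match=MatchAny(any=["курская битва"])),
    #   ])
    # `should` means a point passes if any single condition matches, which covers both
    # the "metadata.*" and the top-level payload key layouts.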

    @staticmethod
    def _post_filter_documents(
        documents: List[Document], years: List[int], events: List[str]
    ) -> List[Document]:
        """Fallback filter in Python in case the vector DB filter cannot be applied."""
        if not years and not events:
            return documents
        year_set = set(years)
        event_set = set(events)
        filtered: List[Document] = []
        for doc in documents:
            metadata = doc.metadata or {}
            doc_years = {
                int(year)
                for year in metadata.get("years", [])
                if isinstance(year, int) or (isinstance(year, str) and year.isdigit())
            }
            doc_events = {str(event).lower() for event in metadata.get("events", [])}
            year_match = not year_set or bool(doc_years.intersection(year_set))
            event_match = not event_set or bool(doc_events.intersection(event_set))
            if year_match and event_match:
                filtered.append(doc)
        return filtered
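    # Note: when the query does name years or events, documents whose metadata lacks
    # the corresponding "years"/"events" lists are dropped here too, since their empty
    # sets cannot intersect the query sets.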

    @staticmethod
    def _merge_unique_documents(documents: List[Document]) -> List[Document]:
        """Deduplicate documents while preserving order."""
        unique_docs: List[Document] = []
        seen = set()
        for doc in documents:
            dedup_key = (
                doc.metadata.get("source", ""),
                doc.metadata.get("page_number", doc.metadata.get("page", "")),
                doc.page_content[:200],
            )
            if dedup_key in seen:
                continue
            seen.add(dedup_key)
            unique_docs.append(doc)
        return unique_docs
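    # The dedup key (source, page, first 200 characters of content) is a heuristic
    # identity for chunks without stable IDs: same page and same opening text are
    # treated as the same document.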

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
@@ -39,8 +126,54 @@ class VectorStoreRetriever(BaseRetriever):
logger.info(f"Searching for documents related to query: {query[:50]}...")
try:
# Perform similarity search on the vector store
results = self.vector_store.similarity_search(query, k=self.top_k)
years_in_query = extract_years_from_text(query)
events_in_query = extract_russian_event_names(query)
search_filter = self._build_qdrant_filter(years_in_query, events_in_query)
logger.info(
f"Extracted query metadata for retrieval: years={years_in_query}, events={events_in_query}"
)
# Main search by original user query.
search_k = max(self.top_k * 3, self.top_k)
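            # Over-fetching (3x top_k) leaves headroom for the deduplication and
            # post-filtering steps below before results are trimmed back to top_k.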
            if search_filter is not None:
                try:
                    results = self.vector_store.similarity_search(
                        query, k=search_k, filter=search_filter
                    )
                except Exception as filter_error:
                    logger.warning(
                        f"Vector store filter failed, falling back to unfiltered search: {filter_error}"
                    )
                    results = self.vector_store.similarity_search(query, k=search_k)
                    results = self._post_filter_documents(
                        results, years_in_query, events_in_query
                    )
            else:
                results = self.vector_store.similarity_search(query, k=search_k)
            # Additional event-focused similarity search if event names are present.
            if events_in_query:
                event_results: List[Document] = []
                for event_name in events_in_query:
                    try:
                        if search_filter is not None:
                            event_docs = self.vector_store.similarity_search(
                                event_name, k=self.top_k, filter=search_filter
                            )
                        else:
                            event_docs = self.vector_store.similarity_search(
                                event_name, k=self.top_k
                            )
                    except Exception as event_search_error:
                        logger.warning(
                            f"Event-focused search failed for '{event_name}': {event_search_error}"
                        )
                        continue
                    event_results.extend(event_docs)
                results.extend(event_results)
            results = self._merge_unique_documents(results)[: self.top_k]
            logger.info(f"Found {len(results)} relevant documents")