Enrich chunk metadata with extracted years and event names

This commit is contained in:
2026-02-10 13:20:19 +03:00
parent ce62fd50ed
commit 447ecaba39
5 changed files with 267 additions and 7 deletions

View File

@@ -39,6 +39,8 @@ from sqlalchemy.orm import sessionmaker
from loguru import logger
import sqlite3
from helpers import extract_russian_event_names, extract_years_from_text
# Load environment variables
load_dotenv()
@@ -189,6 +191,13 @@ class DocumentEnricher:
# Split documents if they are too large
split_docs = self.text_splitter.split_documents(docs)
# Extract additional metadata from each chunk.
for chunk in split_docs:
years = extract_years_from_text(chunk.page_content)
events = extract_russian_event_names(chunk.page_content)
chunk.metadata["years"] = years
chunk.metadata["events"] = events
# Add to the collection
all_docs.extend(split_docs)
@@ -277,4 +286,4 @@ if __name__ == "__main__":
vector_store = initialize_vector_store()
# Run enrichment process
run_enrichment_process(vector_store)
run_enrichment_process(vector_store)