2026-02-04 16:06:01 +03:00
|
|
|
"""
|
|
|
|
|
Document enrichment module for the RAG solution.
|
|
|
|
|
|
|
|
|
|
This module handles loading documents from the data directory,
|
|
|
|
|
processing them with appropriate loaders, splitting them into chunks,
|
|
|
|
|
and storing them in the vector database with proper metadata.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import hashlib
import json
import os
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.core import Document, SimpleDirectoryReader
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
from loguru import logger
from tqdm import tqdm

# Import the new configuration module
from config import get_embedding_model
from vector_storage import get_vector_store_and_index
|
|
|
|
|
|
2026-02-04 16:06:01 +03:00
|
|
|
|
|
|
|
|
class DocumentTracker:
    """Track processed documents in SQLite to avoid re-processing.

    A document is identified by its file path together with an MD5
    checksum of its contents, so an edited file is picked up again on the
    next run while unchanged files are skipped.
    """

    def __init__(self, db_path: str = "document_tracking.db"):
        """Create the tracker and ensure the backing database exists.

        Args:
            db_path: Path of the SQLite database file used for tracking.
        """
        self.db_path = db_path
        self._init_db()

    def _init_db(self) -> None:
        """Create the ``processed_documents`` table if it does not exist."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # NOTE(review): filename is UNIQUE, so two same-named files in
            # different folders collide on INSERT OR REPLACE — schema kept
            # as-is to stay compatible with existing databases.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS processed_documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT UNIQUE NOT NULL,
                    filepath TEXT NOT NULL,
                    checksum TEXT NOT NULL,
                    processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    metadata_json TEXT
                )
            """)
            conn.commit()
        finally:
            # Close the connection even if table creation fails.
            conn.close()
        logger.info(f"Document tracker initialized with database: {self.db_path}")

    def is_document_processed(self, filepath: str) -> bool:
        """Check whether *filepath* was already processed with identical content.

        Args:
            filepath: Path of the file to check.

        Returns:
            True when a row exists with the same path AND the same content
            checksum; a file whose contents changed reports False.
        """
        # Hash the file before opening the DB connection: if the file read
        # raises, no connection is leaked (the original connected first).
        checksum = self._calculate_checksum(filepath)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
                (filepath, checksum),
            )
            count = cursor.fetchone()[0]
        finally:
            conn.close()
        return count > 0

    def mark_document_processed(
        self, filepath: str, metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """Record *filepath* as processed, replacing any previous entry.

        Args:
            filepath: Path of the processed file.
            metadata: Optional extra information stored in the
                ``metadata_json`` column (must be JSON-serializable).
        """
        checksum = self._calculate_checksum(filepath)
        filename = Path(filepath).name

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                """
                INSERT OR REPLACE INTO processed_documents
                (filename, filepath, checksum, processed_at, metadata_json)
                VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
                """,
                # Store real JSON — the column is named metadata_json, but the
                # original wrote str(metadata) (a Python repr, not parseable).
                (filename, filepath, checksum, json.dumps(metadata) if metadata else None),
            )
            conn.commit()
            logger.info(f"Document marked as processed: {filepath}")
        except sqlite3.Error as e:
            logger.error(f"Error marking document as processed: {e}")
        finally:
            conn.close()

    def _calculate_checksum(self, filepath: str) -> str:
        """Return the MD5 hex digest of the file's contents.

        MD5 is used purely as a fast change detector here, not for security.
        """
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            # Read in 4 KiB chunks so large files are not loaded into memory.
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_text_splitter(file_extension: str):
    """Return a node parser appropriate for the given file type.

    Args:
        file_extension: File suffix including the leading dot, e.g. ``".pdf"``.
            Matching is case-insensitive.

    Returns:
        A ``CodeSplitter`` for source-code files, or a ``SentenceSplitter``
        whose chunk size and overlap are tuned to the density of the file
        type (uses the top-level imports; the previous function-local import
        shadowed them and pulled in two unused parsers).
    """
    ext = file_extension.lower()

    # Map code-file extensions to their tree-sitter language names so the
    # CodeSplitter parses each file with its own grammar.  (Previously every
    # code file was split with language="python", which mis-parses e.g. .js
    # or .java sources.)
    # NOTE(review): names assume the standard tree-sitter language bundle —
    # confirm "c_sharp" and "rst" are available in the installed grammars.
    code_languages = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".java": "java",
        ".cpp": "cpp",
        ".c": "c",
        ".h": "c",
        ".cs": "c_sharp",
        ".go": "go",
        ".rs": "rust",
        ".php": "php",
        ".html": "html",
        ".css": "css",
        ".md": "markdown",
        ".rst": "rst",
    }
    if ext in code_languages:
        return CodeSplitter(language=code_languages[ext], max_chars=1000)

    # PDF: smaller chunks for dense multi-page content.
    if ext == ".pdf":
        return SentenceSplitter(chunk_size=512, chunk_overlap=100)

    # PowerPoint: slides typically carry little text each.
    if ext == ".pptx":
        return SentenceSplitter(chunk_size=256, chunk_overlap=50)

    # Spreadsheets: small chunks as well.
    if ext == ".xlsx":
        return SentenceSplitter(chunk_size=256, chunk_overlap=50)

    # Text-heavy documents (Word / OpenDocument): medium-sized chunks.
    if ext in (".docx", ".odt"):
        return SentenceSplitter(chunk_size=768, chunk_overlap=150)

    # Plain text: large chunks.
    if ext == ".txt":
        return SentenceSplitter(chunk_size=1024, chunk_overlap=200)

    # Images are handled elsewhere by multimodal models; a simple splitter
    # covers any textual metadata that reaches this path.
    if ext in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg"):
        return SentenceSplitter(chunk_size=512, chunk_overlap=100)

    # Default for any other file type.
    return SentenceSplitter(chunk_size=768, chunk_overlap=150)
|
2026-02-04 16:06:01 +03:00
|
|
|
|
|
|
|
|
|
2026-02-04 16:51:50 +03:00
|
|
|
def ensure_proper_encoding(text):
    """Coerce *text* into a valid UTF-8 ``str``.

    Handles non-ASCII content (especially Cyrillic) coming from documents
    with mixed encodings.

    Args:
        text: A ``str``, ``bytes``, ``None``, or any other value to be
            stringified.

    Returns:
        A properly encoded text string; ``"unknown"`` when *text* is None.
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Try the encodings most likely for this project's documents:
        # UTF-8 first, then the common Russian/Cyrillic code pages
        # (cp1251 = Windows Cyrillic, koi8-r = legacy Russian).
        for encoding in ("utf-8", "cp1251", "koi8-r"):
            try:
                return text.decode(encoding)
            except UnicodeDecodeError:
                continue
        # Last resort: keep going with replacement characters rather than fail.
        return text.decode("utf-8", errors="replace")

    # Non-string values (ints, Paths, ...) are stringified first; the str
    # branch and the fallback branch of the original duplicated this logic.
    text_str = text if isinstance(text, str) else str(text)
    try:
        # Round-trip to verify the string is encodable as UTF-8.
        return text_str.encode("utf-8").decode("utf-8")
    except UnicodeEncodeError:
        # Lone surrogates etc.: replace the offending code points.
        return text_str.encode("utf-8", errors="replace").decode(
            "utf-8", errors="replace"
        )
|
2026-02-04 16:51:50 +03:00
|
|
|
|
|
|
|
|
|
2026-02-16 15:12:44 +03:00
|
|
|
def process_documents_from_data_folder(
    data_path: str = "../../../data", recursive: bool = True
) -> None:
    """
    Process all documents from the data folder using appropriate loaders and store in vector DB.

    Walks the data directory, skips files whose path+checksum are already
    recorded by DocumentTracker, loads the rest with SimpleDirectoryReader,
    enriches document and node metadata, splits each document with a
    type-appropriate splitter, and inserts the resulting nodes into the
    vector index.  A file is marked processed only after its nodes were
    successfully inserted, so a failed file is retried on the next run.

    Args:
        data_path: Path to the data folder relative to current directory
        recursive: Whether to process subdirectories recursively
    """
    logger.info(f"Starting document enrichment from: {data_path}")

    # Initialize document tracker
    tracker = DocumentTracker()

    # Initialize vector storage
    # NOTE(review): vector_store is unpacked but never used below; only the
    # index is needed here.
    vector_store, index = get_vector_store_and_index()

    # Get the absolute path to the data directory
    # The data_path is relative to the current working directory
    data_abs_path = Path(data_path)

    # If the path is relative, resolve it from the current working directory
    if not data_abs_path.is_absolute():
        data_abs_path = Path.cwd() / data_abs_path

    logger.info(f"Looking for documents in: {data_abs_path.absolute()}")

    if not data_abs_path.exists():
        logger.error(f"Data directory does not exist: {data_abs_path.absolute()}")
        return

    # Find all supported files in the data directory
    # NOTE(review): archive types (.zip/.rar/.tar/.gz) are listed but no
    # extraction step is visible here — confirm SimpleDirectoryReader can
    # actually handle them, otherwise they fail in the try block below.
    supported_extensions = {
        ".pdf",
        ".docx",
        ".xlsx",
        ".pptx",
        ".odt",
        ".txt",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".svg",
        ".zip",
        ".rar",
        ".tar",
        ".gz",
    }

    # Walk through the directory structure, collecting only supported files
    all_files = []
    if recursive:
        for root, dirs, files in os.walk(data_abs_path):
            for file in files:
                file_ext = Path(file).suffix.lower()
                if file_ext in supported_extensions:
                    all_files.append(os.path.join(root, file))
    else:
        # Non-recursive: only the top-level directory entries
        for file in data_abs_path.iterdir():
            if file.is_file():
                file_ext = file.suffix.lower()
                if file_ext in supported_extensions:
                    all_files.append(str(file))

    logger.info(f"Found {len(all_files)} files to process")

    processed_count = 0
    skipped_count = 0

    # Initialize progress bar
    pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")

    for file_path in all_files:
        logger.info(
            f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
        )

        # Check if document has already been processed (same path + checksum)
        if tracker.is_document_processed(file_path):
            logger.info(f"Skipping already processed file: {file_path}")
            skipped_count += 1
            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
            pbar.update(1)
            continue

        try:
            # Load the document using SimpleDirectoryReader
            # This automatically selects the appropriate reader based on file extension
            def file_metadata_func(file_path_str: str) -> Dict[str, str]:
                # Apply proper encoding to filename (handles Cyrillic names)
                filename = ensure_proper_encoding(Path(file_path_str).name)
                return {"filename": filename}

            reader = SimpleDirectoryReader(
                input_files=[file_path], file_metadata=file_metadata_func
            )
            documents = reader.load_data()

            # Process each document (a single file can yield several, e.g.
            # one per PDF page)
            for doc in documents:
                # Extract additional metadata based on document type
                file_ext = Path(file_path).suffix

                # Apply proper encoding to file path
                encoded_file_path = ensure_proper_encoding(file_path)

                # Add additional metadata common to every document type
                doc.metadata["file_path"] = encoded_file_path
                doc.metadata["processed_at"] = datetime.now().isoformat()

                # Handle document-type-specific metadata; each branch
                # normalizes a loader-provided field and tags the file type
                if file_ext.lower() == ".pdf":
                    # PDF-specific metadata
                    doc.metadata["page_label"] = ensure_proper_encoding(
                        doc.metadata.get("page_label", "unknown")
                    )
                    doc.metadata["file_type"] = "pdf"
                elif file_ext.lower() in [".docx", ".odt"]:
                    # Word document metadata
                    doc.metadata["section"] = ensure_proper_encoding(
                        doc.metadata.get("section", "unknown")
                    )
                    doc.metadata["file_type"] = "document"
                elif file_ext.lower() == ".pptx":
                    # PowerPoint metadata
                    doc.metadata["slide_id"] = ensure_proper_encoding(
                        doc.metadata.get("slide_id", "unknown")
                    )
                    doc.metadata["file_type"] = "presentation"
                elif file_ext.lower() == ".xlsx":
                    # Excel metadata
                    doc.metadata["sheet_name"] = ensure_proper_encoding(
                        doc.metadata.get("sheet_name", "unknown")
                    )
                    doc.metadata["file_type"] = "spreadsheet"

                # Determine the appropriate text splitter based on file type
                splitter = get_text_splitter(file_ext)

                # Split the document into nodes
                nodes = splitter.get_nodes_from_documents([doc])

                # Enhance each node's metadata before insertion so chunks can
                # be traced back to their source document and position
                nodes_with_enhanced_metadata = []
                for i, node in enumerate(nodes):
                    # Enhance node metadata with additional information
                    node.metadata["original_doc_id"] = ensure_proper_encoding(
                        doc.doc_id
                    )
                    node.metadata["chunk_number"] = i
                    node.metadata["total_chunks"] = len(nodes)
                    node.metadata["file_path"] = encoded_file_path

                    # Ensure the text content is properly encoded
                    node.text = ensure_proper_encoding(node.text)

                    nodes_with_enhanced_metadata.append(node)

                # Add all nodes to the index at once
                if nodes_with_enhanced_metadata:
                    index.insert_nodes(nodes_with_enhanced_metadata)

                logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

            # Mark document as processed only after successful insertion
            # NOTE(review): "nodes_count" actually records len(documents),
            # i.e. the number of loaded documents, not nodes — confirm intent.
            tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
            processed_count += 1
            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})

        except Exception as e:
            # Best-effort pipeline: log and continue with the next file; the
            # tracker was not updated, so this file is retried next run.
            logger.error(f"Error processing file {file_path}: {str(e)}")

        # Update progress bar regardless of success or failure
        pbar.update(1)

    pbar.close()
    logger.info(
        f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
    )
|
2026-02-04 16:06:01 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def enrich_documents():
    """Entry point for the enrichment pipeline.

    Logs start/finish markers around a single run of
    ``process_documents_from_data_folder`` with its default arguments.
    """
    logger.info("Starting document enrichment process")
    process_documents_from_data_folder()
    logger.info("Document enrichment process completed")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
# Example usage
|
|
|
|
|
logger.info("Running document enrichment...")
|
|
|
|
|
enrich_documents()
|