"""
Document enrichment module for the RAG solution.

This module handles loading documents from the data directory,
processing them with appropriate loaders, splitting them into chunks,
and storing them in the vector database with proper metadata.
"""

import os
import hashlib
import json
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
import sqlite3
from loguru import logger
from tqdm import tqdm

from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter

from vector_storage import get_vector_store_and_index

# Import the configuration module
from config import get_embedding_model


class DocumentTracker:
    """Class to handle tracking of processed documents to avoid re-processing."""

    def __init__(self, db_path: str = "document_tracking.db"):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Initialize the SQLite database for document tracking."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        # Create table for tracking processed documents
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS processed_documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT UNIQUE NOT NULL,
                filepath TEXT NOT NULL,
                checksum TEXT NOT NULL,
                processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                metadata_json TEXT
            )
        ''')

        conn.commit()
        conn.close()
        logger.info(f"Document tracker initialized with database: {self.db_path}")

    def is_document_processed(self, filepath: str) -> bool:
        """Check if a document has already been processed (same path and checksum)."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        # Calculate checksum of the file
        checksum = self._calculate_checksum(filepath)

        cursor.execute(
            "SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
            (filepath, checksum)
        )
        count = cursor.fetchone()[0]

        conn.close()
        return count > 0

    def mark_document_processed(self, filepath: str, metadata: Dict[str, Any] = None):
        """Mark a document as processed in the database."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        checksum = self._calculate_checksum(filepath)
        filename = Path(filepath).name

        try:
            # Store metadata as real JSON so the metadata_json column stays parseable
            cursor.execute('''
                INSERT OR REPLACE INTO processed_documents
                (filename, filepath, checksum, processed_at, metadata_json)
                VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
            ''', (filename, filepath, checksum,
                  json.dumps(metadata, ensure_ascii=False) if metadata else None))

            conn.commit()
            logger.info(f"Document marked as processed: {filepath}")
        except sqlite3.Error as e:
            logger.error(f"Error marking document as processed: {e}")
        finally:
            conn.close()

    def _calculate_checksum(self, filepath: str) -> str:
        """Calculate MD5 checksum of a file (used only for change detection, not security)."""
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
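
# Usage sketch (comments only, not executed on import): the tracker skips files whose
# path and checksum are already recorded. The "notes/report.pdf" path is hypothetical.
#
#     tracker = DocumentTracker("document_tracking.db")
#     if not tracker.is_document_processed("notes/report.pdf"):
#         ...  # load, split, and index the file here
#         tracker.mark_document_processed("notes/report.pdf", {"documents_count": 1})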


def get_text_splitter(file_extension: str):
    """Get an appropriate text splitter based on file type."""
    ext = file_extension.lower()

    # For code files, use CodeSplitter with a language matching the extension
    # (a single hard-coded language="python" would mis-parse non-Python sources).
    # Language names follow tree-sitter conventions.
    code_languages = {
        '.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java',
        '.cpp': 'cpp', '.c': 'c', '.h': 'c', '.cs': 'c_sharp', '.go': 'go',
        '.rs': 'rust', '.php': 'php', '.html': 'html', '.css': 'css',
    }
    if ext in code_languages:
        return CodeSplitter(language=code_languages[ext], max_chars=1000)

    # Markdown and reStructuredText are prose, not code: split by sentences
    elif ext in ['.md', '.rst']:
        return SentenceSplitter(
            chunk_size=1024,
            chunk_overlap=200
        )

    # For PDF files, use a splitter suited to multi-page documents
    elif ext == '.pdf':
        return SentenceSplitter(
            chunk_size=512,  # Smaller chunks for dense PDF content
            chunk_overlap=100
        )

    # For presentation files (PowerPoint), use smaller chunks
    elif ext == '.pptx':
        return SentenceSplitter(
            chunk_size=256,  # Slides typically have less text
            chunk_overlap=50
        )

    # For spreadsheets, use smaller chunks
    elif ext == '.xlsx':
        return SentenceSplitter(
            chunk_size=256,
            chunk_overlap=50
        )

    # For text-heavy documents like Word, use medium-sized chunks
    elif ext in ['.docx', '.odt']:
        return SentenceSplitter(
            chunk_size=768,
            chunk_overlap=150
        )

    # For plain text files, use larger chunks
    elif ext == '.txt':
        return SentenceSplitter(
            chunk_size=1024,
            chunk_overlap=200
        )

    # For image files, we handle them differently (metadata extraction);
    # images are processed by multimodal models, so a simple splitter suffices
    elif ext in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']:
        return SentenceSplitter(
            chunk_size=512,
            chunk_overlap=100
        )

    # For other files, use a standard SentenceSplitter
    else:
        return SentenceSplitter(
            chunk_size=768,
            chunk_overlap=150
        )
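
# Usage sketch (illustrative only): the returned splitter is used the same way for
# every file type, e.g. for some already-loaded Document object `doc` from a PDF:
#
#     splitter = get_text_splitter(".pdf")
#     nodes = splitter.get_nodes_from_documents([doc])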


def ensure_proper_encoding(text):
    """
    Helper function to ensure proper encoding of text, especially for non-ASCII
    characters like Cyrillic.

    Args:
        text: Text that may need encoding correction

    Returns:
        Properly encoded text string
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Decode bytes to string with proper encoding
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
            try:
                return text.decode('cp1251')  # Windows Cyrillic encoding
            except UnicodeDecodeError:
                try:
                    return text.decode('koi8-r')  # Russian encoding
                except UnicodeDecodeError:
                    # If all else fails, decode with errors='replace'
                    return text.decode('utf-8', errors='replace')
    elif isinstance(text, str):
        # Ensure the string is properly encoded
        try:
            # Try to encode and decode to ensure it's valid UTF-8
            return text.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            # If there are encoding issues, try to fix them
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
    else:
        # Convert other types to string and ensure proper encoding
        text_str = str(text)
        try:
            return text_str.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
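
# Behavior sketch (illustrative): bytes that are not valid UTF-8 fall back to the
# Cyrillic codecs, while ordinary strings pass through unchanged.
#
#     ensure_proper_encoding("Привет".encode("cp1251"))  # -> "Привет"
#     ensure_proper_encoding("already fine")             # -> "already fine"
#     ensure_proper_encoding(None)                       # -> "unknown"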


def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
    """
    Process all documents from the data folder using appropriate loaders and store them in the vector DB.

    Args:
        data_path: Path to the data folder, relative to the current working directory
        recursive: Whether to process subdirectories recursively
    """
    logger.info(f"Starting document enrichment from: {data_path}")

    # Initialize document tracker
    tracker = DocumentTracker()

    # Initialize vector storage
    vector_store, index = get_vector_store_and_index()

    # Resolve the data directory; a relative path is resolved from the current working directory
    data_abs_path = Path(data_path)
    if not data_abs_path.is_absolute():
        data_abs_path = Path.cwd() / data_abs_path

    logger.info(f"Looking for documents in: {data_abs_path.absolute()}")

    if not data_abs_path.exists():
        logger.error(f"Data directory does not exist: {data_abs_path.absolute()}")
        return

    # Find all supported files in the data directory
    supported_extensions = {
        '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.txt',
        '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
        '.zip', '.rar', '.tar', '.gz'
    }

    # Walk through the directory structure
    all_files = []
    if recursive:
        for root, dirs, files in os.walk(data_abs_path):
            for file in files:
                file_ext = Path(file).suffix.lower()
                if file_ext in supported_extensions:
                    all_files.append(os.path.join(root, file))
    else:
        for file in data_abs_path.iterdir():
            if file.is_file():
                file_ext = file.suffix.lower()
                if file_ext in supported_extensions:
                    all_files.append(str(file))

    logger.info(f"Found {len(all_files)} files to process")

    processed_count = 0
    skipped_count = 0

    # Initialize progress bar
    pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")

    for file_path in all_files:
        logger.info(f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})")

        # Check if the document has already been processed
        if tracker.is_document_processed(file_path):
            logger.info(f"Skipping already processed file: {file_path}")
            skipped_count += 1
            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
            pbar.update(1)
            continue

        try:
            # Load the document using SimpleDirectoryReader, which automatically
            # selects the appropriate reader based on the file extension
            def file_metadata_func(file_path_str):
                # Apply proper encoding to the filename
                filename = ensure_proper_encoding(Path(file_path_str).name)
                return {"filename": filename}

            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_metadata=file_metadata_func
            )
            documents = reader.load_data()

            # Process each document
            for doc in documents:
                # Extract additional metadata based on document type
                file_ext = Path(file_path).suffix

                # Apply proper encoding to the file path
                encoded_file_path = ensure_proper_encoding(file_path)

                # Add additional metadata
                doc.metadata["file_path"] = encoded_file_path
                doc.metadata["processed_at"] = datetime.now().isoformat()

                # Handle document-type-specific metadata
                if file_ext.lower() == '.pdf':
                    # PDF-specific metadata
                    doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
                    doc.metadata["file_type"] = "pdf"

                elif file_ext.lower() in ['.docx', '.odt']:
                    # Word document metadata
                    doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
                    doc.metadata["file_type"] = "document"

                elif file_ext.lower() == '.pptx':
                    # PowerPoint metadata
                    doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
                    doc.metadata["file_type"] = "presentation"

                elif file_ext.lower() == '.xlsx':
                    # Excel metadata
                    doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
                    doc.metadata["file_type"] = "spreadsheet"

                # Determine the appropriate text splitter based on file type
                splitter = get_text_splitter(file_ext)

                # Split the document into nodes
                nodes = splitter.get_nodes_from_documents([doc])

                # Enhance node metadata before inserting into the vector index
                nodes_with_enhanced_metadata = []
                for i, node in enumerate(nodes):
                    # Enhance node metadata with additional information
                    node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                    node.metadata["chunk_number"] = i
                    node.metadata["total_chunks"] = len(nodes)
                    node.metadata["file_path"] = encoded_file_path

                    # Ensure the text content is properly encoded
                    node.text = ensure_proper_encoding(node.text)

                    nodes_with_enhanced_metadata.append(node)

                # Add all nodes to the index at once
                if nodes_with_enhanced_metadata:
                    index.insert_nodes(nodes_with_enhanced_metadata)

                logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

            # Mark the document as processed only after successful insertion
            tracker.mark_document_processed(file_path, {"documents_count": len(documents)})
            processed_count += 1
            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")

        # Update the progress bar regardless of success or failure
        pbar.update(1)

    pbar.close()
    logger.info(f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}")


def enrich_documents():
    """Main function to run the document enrichment process."""
    logger.info("Starting document enrichment process")
    process_documents_from_data_folder()
    logger.info("Document enrichment process completed")


if __name__ == "__main__":
    # Example usage
    logger.info("Running document enrichment...")
    enrich_documents()