Retrieval and Russian language update
@@ -158,6 +158,50 @@ def get_text_splitter(file_extension: str):
     )
 
 
+def ensure_proper_encoding(text):
+    """
+    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.
+
+    Args:
+        text: Text that may need encoding correction
+
+    Returns:
+        Properly encoded text string
+    """
+    if text is None:
+        return "unknown"
+
+    if isinstance(text, bytes):
+        # Decode bytes to string with proper encoding
+        try:
+            return text.decode('utf-8')
+        except UnicodeDecodeError:
+            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
+            try:
+                return text.decode('cp1251')  # Windows Cyrillic encoding
+            except UnicodeDecodeError:
+                try:
+                    return text.decode('koi8-r')  # Russian encoding
+                except UnicodeDecodeError:
+                    # If all else fails, decode with errors='replace'
+                    return text.decode('utf-8', errors='replace')
+    elif isinstance(text, str):
+        # Ensure the string is properly encoded
+        try:
+            # Try to encode and decode to ensure it's valid UTF-8
+            return text.encode('utf-8').decode('utf-8')
+        except UnicodeEncodeError:
+            # If there are encoding issues, try to fix them
+            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+    else:
+        # Convert other types to string and ensure proper encoding
+        text_str = str(text)
+        try:
+            return text_str.encode('utf-8').decode('utf-8')
+        except UnicodeEncodeError:
+            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+
+
 def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
     """
     Process all documents from the data folder using appropriate loaders and store in vector DB.
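For context on what the new helper is meant to do, here is a minimal usage sketch of the decode fallback (UTF-8 first, then cp1251, then koi8-r, then UTF-8 with replacement characters). It is illustrative only and not part of the commit; it assumes ensure_proper_encoding has been imported from the module this diff modifies.

# Illustrative only, not part of the commit. Assumes the helper is importable
# from the ingestion module touched by this diff (module path not shown here).
# from <ingestion_module> import ensure_proper_encoding

assert ensure_proper_encoding("Привет".encode("utf-8")) == "Привет"   # first decode attempt succeeds
assert ensure_proper_encoding("Привет".encode("cp1251")) == "Привет"  # UTF-8 decode fails, cp1251 fallback recovers it
assert ensure_proper_encoding(None) == "unknown"                      # None is normalized to "unknown"
assert ensure_proper_encoding(42) == "42"                             # non-string values are stringified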
@@ -228,7 +272,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
         # Load the document using SimpleDirectoryReader
         # This automatically selects the appropriate reader based on file extension
         def file_metadata_func(file_path_str):
-            return {"filename": Path(file_path_str).name}
+            # Apply proper encoding to filename
+            filename = ensure_proper_encoding(Path(file_path_str).name)
+            return {"filename": filename}

         reader = SimpleDirectoryReader(
             input_files=[file_path],
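The hunk above is cut off before the rest of the SimpleDirectoryReader arguments, so the wiring of file_metadata_func is not visible here. Presumably it is passed through the reader's file_metadata callback; a rough sketch of that wiring, under that assumption:

# Sketch under assumption; the actual argument list is truncated in this hunk.
# Import path varies with the llama_index version in use.
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(
    input_files=[file_path],           # single file per reader, as in the diff
    file_metadata=file_metadata_func,  # attaches the encoding-safe filename to each Document
)
documents = reader.load_data()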
@@ -241,29 +287,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
             # Extract additional metadata based on document type
             file_ext = Path(file_path).suffix

+            # Apply proper encoding to file path
+            encoded_file_path = ensure_proper_encoding(file_path)
+
             # Add additional metadata
-            doc.metadata["file_path"] = file_path
+            doc.metadata["file_path"] = encoded_file_path
             doc.metadata["processed_at"] = datetime.now().isoformat()

             # Handle document-type-specific metadata
             if file_ext.lower() == '.pdf':
                 # PDF-specific metadata
-                doc.metadata["page_label"] = doc.metadata.get("page_label", "unknown")
+                doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
                 doc.metadata["file_type"] = "pdf"

             elif file_ext.lower() in ['.docx', '.odt']:
                 # Word document metadata
-                doc.metadata["section"] = doc.metadata.get("section", "unknown")
+                doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
                 doc.metadata["file_type"] = "document"

             elif file_ext.lower() == '.pptx':
                 # PowerPoint metadata
-                doc.metadata["slide_id"] = doc.metadata.get("slide_id", "unknown")
+                doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
                 doc.metadata["file_type"] = "presentation"

             elif file_ext.lower() == '.xlsx':
                 # Excel metadata
-                doc.metadata["sheet_name"] = doc.metadata.get("sheet_name", "unknown")
+                doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
                 doc.metadata["file_type"] = "spreadsheet"

             # Determine the appropriate text splitter based on file type
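To make the intent of the normalization above concrete, this is roughly what the per-document metadata looks like for a PDF with a Cyrillic filename after this hunk runs. The values are hypothetical and only illustrate that Cyrillic fields survive intact:

# Hypothetical values, illustrative only; not produced by the commit itself.
example_pdf_metadata = {
    "filename": "отчёт_2024.pdf",         # from file_metadata_func, encoding-safe
    "file_path": "/data/отчёт_2024.pdf",  # normalized via ensure_proper_encoding
    "processed_at": "2024-01-15T10:30:00",
    "page_label": "1",                    # defaults to "unknown" when the loader omits it
    "file_type": "pdf",
}
print(example_pdf_metadata["filename"])   # Cyrillic name prints intact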
@@ -276,17 +325,21 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
             nodes_with_enhanced_metadata = []
             for i, node in enumerate(nodes):
                 # Enhance node metadata with additional information
-                node.metadata["original_doc_id"] = doc.doc_id
+                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                 node.metadata["chunk_number"] = i
                 node.metadata["total_chunks"] = len(nodes)
-                node.metadata["file_path"] = file_path
+                node.metadata["file_path"] = encoded_file_path
+
+                # Ensure the text content is properly encoded
+                node.text = ensure_proper_encoding(node.text)
+
                 nodes_with_enhanced_metadata.append(node)

             # Add all nodes to the index at once
             if nodes_with_enhanced_metadata:
                 index.insert_nodes(nodes_with_enhanced_metadata)

-            logger.info(f"Processed {len(nodes)} nodes from {file_path}")
+            logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

             # Mark document as processed only after successful insertion
             tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})