Add RAGFlow to the repository, with a Codex-created Yandex Disk plugin (just in case); also add LlamaIndex enrichment with predefined Yandex Disk data

2026-02-25 11:28:29 +03:00
parent c29928cc89
commit 2c7ab06b3f
12 changed files with 98507 additions and 132 deletions
--- a/services/rag/llamaindex/enrichment.py
+++ b/services/rag/llamaindex/enrichment.py
@@ -11,7 +11,7 @@ import os
 import sqlite3
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from llama_index.core import Document, SimpleDirectoryReader
 from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
@@ -25,6 +25,35 @@ from config import get_embedding_model
 from vector_storage import get_vector_store_and_index


+SUPPORTED_ENRICHMENT_EXTENSIONS = {
+    ".csv",
+    ".doc",
+    ".docx",
+    ".epub",
+    ".htm",
+    ".html",
+    ".json",
+    ".jsonl",
+    ".md",
+    ".odt",
+    ".pdf",
+    ".ppt",
+    ".pptx",
+    ".rtf",
+    ".rst",
+    ".tsv",
+    ".txt",
+    ".xls",
+    ".xlsx",
+    ".xml",
+}
+
+
+def get_supported_enrichment_extensions() -> set[str]:
+    """Return the file extensions currently supported by enrichment."""
+    return set(SUPPORTED_ENRICHMENT_EXTENSIONS)
+
+
 class DocumentTracker:
    """Class to handle tracking of processed documents to avoid re-processing."""

@@ -251,24 +280,7 @@ def process_documents_from_data_folder(
        return

    # Find all supported files in the data directory
-    supported_extensions = {
-        ".pdf",
-        ".docx",
-        ".xlsx",
-        ".pptx",
-        ".odt",
-        ".txt",
-        ".png",
-        ".jpg",
-        ".jpeg",
-        ".gif",
-        ".bmp",
-        ".svg",
-        ".zip",
-        ".rar",
-        ".tar",
-        ".gz",
-    }
+    supported_extensions = get_supported_enrichment_extensions()

    # Walk through the directory structure
    all_files = []
@@ -285,10 +297,13 @@ def process_documents_from_data_folder(
                if file_ext in supported_extensions:
                    all_files.append(str(file))

-    logger.info(f"Found {len(all_files)} files to process")
+    logger.info(
+        f"Found {len(all_files)} supported files to process (extensions: {', '.join(sorted(supported_extensions))})"
+    )

    processed_count = 0
    skipped_count = 0
+    error_count = 0

    # Initialize progress bar
    pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
@@ -298,113 +313,126 @@ def process_documents_from_data_folder(
            f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
        )

-        # Check if document has already been processed
-        if tracker.is_document_processed(file_path):
-            logger.info(f"Skipping already processed file: {file_path}")
-            skipped_count += 1
-            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
-            pbar.update(1)
-            continue
-
        try:
-            # Load the document using SimpleDirectoryReader
-            # This automatically selects the appropriate reader based on file extension
-            def file_metadata_func(file_path_str):
-                # Apply proper encoding to filename
-                filename = ensure_proper_encoding(Path(file_path_str).name)
-                return {"filename": filename}
-
-            reader = SimpleDirectoryReader(
-                input_files=[file_path], file_metadata=file_metadata_func
+            result = process_document_file(file_path, tracker=tracker, index=index)
+            if result["status"] == "processed":
+                processed_count += 1
+            elif result["status"] == "skipped":
+                skipped_count += 1
+            else:
+                error_count += 1
+            pbar.set_postfix(
+                {"Processed": processed_count, "Skipped": skipped_count, "Errors": error_count}
            )
-            documents = reader.load_data()
-
-            # Process each document
-            for doc in documents:
-                # Extract additional metadata based on document type
-                file_ext = Path(file_path).suffix
-
-                # Apply proper encoding to file path
-                encoded_file_path = ensure_proper_encoding(file_path)
-
-                # Add additional metadata
-                doc.metadata["file_path"] = encoded_file_path
-                doc.metadata["processed_at"] = datetime.now().isoformat()
-
-                # Handle document-type-specific metadata
-                if file_ext.lower() == ".pdf":
-                    # PDF-specific metadata
-                    doc.metadata["page_label"] = ensure_proper_encoding(
-                        doc.metadata.get("page_label", "unknown")
-                    )
-                    doc.metadata["file_type"] = "pdf"
-
-                elif file_ext.lower() in [".docx", ".odt"]:
-                    # Word document metadata
-                    doc.metadata["section"] = ensure_proper_encoding(
-                        doc.metadata.get("section", "unknown")
-                    )
-                    doc.metadata["file_type"] = "document"
-
-                elif file_ext.lower() == ".pptx":
-                    # PowerPoint metadata
-                    doc.metadata["slide_id"] = ensure_proper_encoding(
-                        doc.metadata.get("slide_id", "unknown")
-                    )
-                    doc.metadata["file_type"] = "presentation"
-
-                elif file_ext.lower() == ".xlsx":
-                    # Excel metadata
-                    doc.metadata["sheet_name"] = ensure_proper_encoding(
-                        doc.metadata.get("sheet_name", "unknown")
-                    )
-                    doc.metadata["file_type"] = "spreadsheet"
-
-                # Determine the appropriate text splitter based on file type
-                splitter = get_text_splitter(file_ext)
-
-                # Split the document into nodes
-                nodes = splitter.get_nodes_from_documents([doc])
-
-                # Insert nodes into the vector index
-                nodes_with_enhanced_metadata = []
-                for i, node in enumerate(nodes):
-                    # Enhance node metadata with additional information
-                    node.metadata["original_doc_id"] = ensure_proper_encoding(
-                        doc.doc_id
-                    )
-                    node.metadata["chunk_number"] = i
-                    node.metadata["total_chunks"] = len(nodes)
-                    node.metadata["file_path"] = encoded_file_path
-
-                    # Ensure the text content is properly encoded
-                    node.text = ensure_proper_encoding(node.text)
-
-                    nodes_with_enhanced_metadata.append(node)
-
-                # Add all nodes to the index at once
-                if nodes_with_enhanced_metadata:
-                    index.insert_nodes(nodes_with_enhanced_metadata)
-
-                logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")
-
-            # Mark document as processed only after successful insertion
-            tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
-            processed_count += 1
-            pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")
+            error_count += 1
+            pbar.set_postfix(
+                {"Processed": processed_count, "Skipped": skipped_count, "Errors": error_count}
+            )

        # Update progress bar regardless of success or failure
        pbar.update(1)

    pbar.close()
    logger.info(
-        f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
+        f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}, Errors: {error_count}"
    )


+def process_document_file(
+    file_path: str,
+    tracker: Optional[DocumentTracker] = None,
+    index=None,
+) -> Dict[str, Any]:
+    """
+    Process a single document file and store its chunks in the vector index.
+
+    Returns a dict with status and counters. Status is one of:
+    `processed`, `skipped`, `error`.
+    """
+    file_ext = Path(file_path).suffix.lower()
+    if file_ext not in get_supported_enrichment_extensions():
+        logger.info(f"Skipping unsupported extension for file: {file_path}")
+        return {"status": "skipped", "reason": "unsupported_extension", "nodes": 0}
+
+    tracker = tracker or DocumentTracker()
+
+    if tracker.is_document_processed(file_path):
+        logger.info(f"Skipping already processed file: {file_path}")
+        return {"status": "skipped", "reason": "already_processed", "nodes": 0}
+
+    if index is None:
+        _, index = get_vector_store_and_index()
+
+    try:
+        def file_metadata_func(file_path_str):
+            filename = ensure_proper_encoding(Path(file_path_str).name)
+            return {"filename": filename}
+
+        reader = SimpleDirectoryReader(
+            input_files=[file_path], file_metadata=file_metadata_func
+        )
+        documents = reader.load_data()
+
+        total_nodes_inserted = 0
+        for doc in documents:
+            current_file_ext = Path(file_path).suffix
+            encoded_file_path = ensure_proper_encoding(file_path)
+
+            doc.metadata["file_path"] = encoded_file_path
+            doc.metadata["processed_at"] = datetime.now().isoformat()
+
+            if current_file_ext.lower() == ".pdf":
+                doc.metadata["page_label"] = ensure_proper_encoding(
+                    doc.metadata.get("page_label", "unknown")
+                )
+                doc.metadata["file_type"] = "pdf"
+            elif current_file_ext.lower() in [".docx", ".odt", ".doc", ".rtf"]:
+                doc.metadata["section"] = ensure_proper_encoding(
+                    doc.metadata.get("section", "unknown")
+                )
+                doc.metadata["file_type"] = "document"
+            elif current_file_ext.lower() in [".pptx", ".ppt"]:
+                doc.metadata["slide_id"] = ensure_proper_encoding(
+                    doc.metadata.get("slide_id", "unknown")
+                )
+                doc.metadata["file_type"] = "presentation"
+            elif current_file_ext.lower() in [".xlsx", ".xls", ".csv", ".tsv"]:
+                doc.metadata["sheet_name"] = ensure_proper_encoding(
+                    doc.metadata.get("sheet_name", "unknown")
+                )
+                doc.metadata["file_type"] = "spreadsheet"
+
+            splitter = get_text_splitter(current_file_ext)
+            nodes = splitter.get_nodes_from_documents([doc])
+
+            nodes_with_enhanced_metadata = []
+            for i, node in enumerate(nodes):
+                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
+                node.metadata["chunk_number"] = i
+                node.metadata["total_chunks"] = len(nodes)
+                node.metadata["file_path"] = encoded_file_path
+                node.text = ensure_proper_encoding(node.text)
+                nodes_with_enhanced_metadata.append(node)
+
+            if nodes_with_enhanced_metadata:
+                index.insert_nodes(nodes_with_enhanced_metadata)
+                total_nodes_inserted += len(nodes_with_enhanced_metadata)
+
+            logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")
+
+        tracker.mark_document_processed(
+            file_path,
+            {"documents_count": len(documents), "nodes_count": total_nodes_inserted},
+        )
+        return {"status": "processed", "nodes": total_nodes_inserted}
+    except Exception as e:
+        logger.error(f"Error processing file {file_path}: {e}")
+        return {"status": "error", "reason": str(e), "nodes": 0}
+
+
 def enrich_documents():
    """Main function to run the document enrichment process."""
    logger.info("Starting document enrichment process")