Add RAGFlow to the repository, with a Codex-created Yandex Disk plugin (just in case); also add LlamaIndex enrichment with predefined Yandex Disk data
This commit is contained in:
@@ -11,7 +11,7 @@ import os
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from llama_index.core import Document, SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
|
||||
@@ -25,6 +25,35 @@ from config import get_embedding_model
|
||||
from vector_storage import get_vector_store_and_index
|
||||
|
||||
|
||||
# File extensions (lower-case, with the leading dot) accepted by enrichment.
SUPPORTED_ENRICHMENT_EXTENSIONS = {
    # Plain text and markup
    ".htm", ".html", ".md", ".rst", ".txt", ".xml",
    # Structured and tabular data
    ".csv", ".json", ".jsonl", ".tsv", ".xls", ".xlsx",
    # Word-processor style documents and e-books
    ".doc", ".docx", ".epub", ".odt", ".pdf", ".rtf",
    # Presentations
    ".ppt", ".pptx",
}
|
||||
|
||||
|
||||
def get_supported_enrichment_extensions() -> set[str]:
    """Return a fresh set of the file extensions enrichment accepts.

    The result is a copy of ``SUPPORTED_ENRICHMENT_EXTENSIONS``, so callers
    may modify it without affecting the module-level constant.
    """
    return {*SUPPORTED_ENRICHMENT_EXTENSIONS}
|
||||
|
||||
|
||||
class DocumentTracker:
|
||||
"""Class to handle tracking of processed documents to avoid re-processing."""
|
||||
|
||||
@@ -251,24 +280,7 @@ def process_documents_from_data_folder(
|
||||
return
|
||||
|
||||
# Find all supported files in the data directory
|
||||
supported_extensions = {
|
||||
".pdf",
|
||||
".docx",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
".odt",
|
||||
".txt",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".gif",
|
||||
".bmp",
|
||||
".svg",
|
||||
".zip",
|
||||
".rar",
|
||||
".tar",
|
||||
".gz",
|
||||
}
|
||||
supported_extensions = get_supported_enrichment_extensions()
|
||||
|
||||
# Walk through the directory structure
|
||||
all_files = []
|
||||
@@ -285,10 +297,13 @@ def process_documents_from_data_folder(
|
||||
if file_ext in supported_extensions:
|
||||
all_files.append(str(file))
|
||||
|
||||
logger.info(f"Found {len(all_files)} files to process")
|
||||
logger.info(
|
||||
f"Found {len(all_files)} supported files to process (extensions: {', '.join(sorted(supported_extensions))})"
|
||||
)
|
||||
|
||||
processed_count = 0
|
||||
skipped_count = 0
|
||||
error_count = 0
|
||||
|
||||
# Initialize progress bar
|
||||
pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
|
||||
@@ -298,113 +313,126 @@ def process_documents_from_data_folder(
|
||||
f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
|
||||
)
|
||||
|
||||
# Check if document has already been processed
|
||||
if tracker.is_document_processed(file_path):
|
||||
logger.info(f"Skipping already processed file: {file_path}")
|
||||
skipped_count += 1
|
||||
pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
|
||||
pbar.update(1)
|
||||
continue
|
||||
|
||||
try:
|
||||
# Load the document using SimpleDirectoryReader
|
||||
# This automatically selects the appropriate reader based on file extension
|
||||
def file_metadata_func(file_path_str):
|
||||
# Apply proper encoding to filename
|
||||
filename = ensure_proper_encoding(Path(file_path_str).name)
|
||||
return {"filename": filename}
|
||||
|
||||
reader = SimpleDirectoryReader(
|
||||
input_files=[file_path], file_metadata=file_metadata_func
|
||||
result = process_document_file(file_path, tracker=tracker, index=index)
|
||||
if result["status"] == "processed":
|
||||
processed_count += 1
|
||||
elif result["status"] == "skipped":
|
||||
skipped_count += 1
|
||||
else:
|
||||
error_count += 1
|
||||
pbar.set_postfix(
|
||||
{"Processed": processed_count, "Skipped": skipped_count, "Errors": error_count}
|
||||
)
|
||||
documents = reader.load_data()
|
||||
|
||||
# Process each document
|
||||
for doc in documents:
|
||||
# Extract additional metadata based on document type
|
||||
file_ext = Path(file_path).suffix
|
||||
|
||||
# Apply proper encoding to file path
|
||||
encoded_file_path = ensure_proper_encoding(file_path)
|
||||
|
||||
# Add additional metadata
|
||||
doc.metadata["file_path"] = encoded_file_path
|
||||
doc.metadata["processed_at"] = datetime.now().isoformat()
|
||||
|
||||
# Handle document-type-specific metadata
|
||||
if file_ext.lower() == ".pdf":
|
||||
# PDF-specific metadata
|
||||
doc.metadata["page_label"] = ensure_proper_encoding(
|
||||
doc.metadata.get("page_label", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "pdf"
|
||||
|
||||
elif file_ext.lower() in [".docx", ".odt"]:
|
||||
# Word document metadata
|
||||
doc.metadata["section"] = ensure_proper_encoding(
|
||||
doc.metadata.get("section", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "document"
|
||||
|
||||
elif file_ext.lower() == ".pptx":
|
||||
# PowerPoint metadata
|
||||
doc.metadata["slide_id"] = ensure_proper_encoding(
|
||||
doc.metadata.get("slide_id", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "presentation"
|
||||
|
||||
elif file_ext.lower() == ".xlsx":
|
||||
# Excel metadata
|
||||
doc.metadata["sheet_name"] = ensure_proper_encoding(
|
||||
doc.metadata.get("sheet_name", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "spreadsheet"
|
||||
|
||||
# Determine the appropriate text splitter based on file type
|
||||
splitter = get_text_splitter(file_ext)
|
||||
|
||||
# Split the document into nodes
|
||||
nodes = splitter.get_nodes_from_documents([doc])
|
||||
|
||||
# Insert nodes into the vector index
|
||||
nodes_with_enhanced_metadata = []
|
||||
for i, node in enumerate(nodes):
|
||||
# Enhance node metadata with additional information
|
||||
node.metadata["original_doc_id"] = ensure_proper_encoding(
|
||||
doc.doc_id
|
||||
)
|
||||
node.metadata["chunk_number"] = i
|
||||
node.metadata["total_chunks"] = len(nodes)
|
||||
node.metadata["file_path"] = encoded_file_path
|
||||
|
||||
# Ensure the text content is properly encoded
|
||||
node.text = ensure_proper_encoding(node.text)
|
||||
|
||||
nodes_with_enhanced_metadata.append(node)
|
||||
|
||||
# Add all nodes to the index at once
|
||||
if nodes_with_enhanced_metadata:
|
||||
index.insert_nodes(nodes_with_enhanced_metadata)
|
||||
|
||||
logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")
|
||||
|
||||
# Mark document as processed only after successful insertion
|
||||
tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
|
||||
processed_count += 1
|
||||
pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {file_path}: {str(e)}")
|
||||
error_count += 1
|
||||
pbar.set_postfix(
|
||||
{"Processed": processed_count, "Skipped": skipped_count, "Errors": error_count}
|
||||
)
|
||||
|
||||
# Update progress bar regardless of success or failure
|
||||
pbar.update(1)
|
||||
|
||||
pbar.close()
|
||||
logger.info(
|
||||
f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
|
||||
f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}, Errors: {error_count}"
|
||||
)
|
||||
|
||||
|
||||
def process_document_file(
    file_path: str,
    tracker: Optional[DocumentTracker] = None,
    index=None,
) -> Dict[str, Any]:
    """
    Process a single document file and store its chunks in the vector index.

    Args:
        file_path: Path of the file to load, split and index.
        tracker: Tracker used to detect and record processed files. A new
            ``DocumentTracker`` is created when omitted.
        index: Vector index that receives the nodes. When omitted it is
            obtained from ``get_vector_store_and_index()``.

    Returns:
        A dict with a ``status`` of ``processed``, ``skipped`` or ``error``,
        a ``nodes`` counter, and a ``reason`` for skipped and error results.

    Exceptions raised while loading, splitting or inserting are caught,
    logged and reported as an ``error`` result instead of propagating.
    """
    raw_ext = Path(file_path).suffix
    file_ext = raw_ext.lower()
    if file_ext not in get_supported_enrichment_extensions():
        logger.info(f"Skipping unsupported extension for file: {file_path}")
        return {"status": "skipped", "reason": "unsupported_extension", "nodes": 0}

    # Compare against None explicitly so that a caller-supplied tracker is
    # never replaced just because it happens to evaluate as falsy.
    if tracker is None:
        tracker = DocumentTracker()

    if tracker.is_document_processed(file_path):
        logger.info(f"Skipping already processed file: {file_path}")
        return {"status": "skipped", "reason": "already_processed", "nodes": 0}

    if index is None:
        _, index = get_vector_store_and_index()

    try:
        def file_metadata_func(file_path_str):
            return {"filename": ensure_proper_encoding(Path(file_path_str).name)}

        documents = SimpleDirectoryReader(
            input_files=[file_path], file_metadata=file_metadata_func
        ).load_data()

        # The path is the same for every loaded document, so encode it once.
        encoded_file_path = ensure_proper_encoding(file_path)

        pending_nodes = []
        nodes_per_document = []
        for doc in documents:
            doc.metadata["file_path"] = encoded_file_path
            doc.metadata["processed_at"] = datetime.now().isoformat()
            _apply_file_type_metadata(doc, file_ext)

            # The splitter receives the suffix exactly as it appears in the path.
            splitter = get_text_splitter(raw_ext)
            nodes = splitter.get_nodes_from_documents([doc])

            total_chunks = len(nodes)
            for chunk_number, node in enumerate(nodes):
                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                node.metadata["chunk_number"] = chunk_number
                node.metadata["total_chunks"] = total_chunks
                node.metadata["file_path"] = encoded_file_path
                node.text = ensure_proper_encoding(node.text)

            pending_nodes.extend(nodes)
            nodes_per_document.append(total_chunks)

        # Insert only after every document has been split: a failure while
        # preparing a later document must not leave earlier ones in the index
        # for a file that is still unmarked and will therefore be retried.
        if pending_nodes:
            index.insert_nodes(pending_nodes)

        for node_count in nodes_per_document:
            logger.info(f"Processed {node_count} nodes from {encoded_file_path}")

        total_nodes_inserted = len(pending_nodes)
        tracker.mark_document_processed(
            file_path,
            {"documents_count": len(documents), "nodes_count": total_nodes_inserted},
        )
        return {"status": "processed", "nodes": total_nodes_inserted}
    except Exception as e:
        # Per-file boundary: the failure is reported through the result dict.
        logger.error(f"Error processing file {file_path}: {e}")
        return {"status": "error", "reason": str(e), "nodes": 0}


# Lower-cased extension -> (metadata key to normalise, ``file_type`` label).
_FILE_TYPE_METADATA = {
    ".pdf": ("page_label", "pdf"),
    ".docx": ("section", "document"),
    ".odt": ("section", "document"),
    ".doc": ("section", "document"),
    ".rtf": ("section", "document"),
    ".pptx": ("slide_id", "presentation"),
    ".ppt": ("slide_id", "presentation"),
    ".xlsx": ("sheet_name", "spreadsheet"),
    ".xls": ("sheet_name", "spreadsheet"),
    ".csv": ("sheet_name", "spreadsheet"),
    ".tsv": ("sheet_name", "spreadsheet"),
}


def _apply_file_type_metadata(doc, file_ext: str) -> None:
    """Tag ``doc`` with its ``file_type`` and normalise the matching metadata key.

    ``file_ext`` must already be lower-cased. Extensions without an entry in
    ``_FILE_TYPE_METADATA`` leave the document untouched.
    """
    entry = _FILE_TYPE_METADATA.get(file_ext)
    if entry is None:
        return
    metadata_key, file_type = entry
    doc.metadata[metadata_key] = ensure_proper_encoding(
        doc.metadata.get(metadata_key, "unknown")
    )
    doc.metadata["file_type"] = file_type
|
||||
|
||||
|
||||
def enrich_documents():
|
||||
"""Main function to run the document enrichment process."""
|
||||
logger.info("Starting document enrichment process")
|
||||
|
||||
Reference in New Issue
Block a user