Retrieval, plus updates for Russian-language support

This commit is contained in:
2026-02-04 16:51:50 +03:00
parent 3dea3605ad
commit ea4ce23cd9
7 changed files with 572 additions and 17 deletions

View File

@@ -158,6 +158,50 @@ def get_text_splitter(file_extension: str):
)
def ensure_proper_encoding(text):
"""
Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.
Args:
text: Text that may need encoding correction
Returns:
Properly encoded text string
"""
if text is None:
return "unknown"
if isinstance(text, bytes):
# Decode bytes to string with proper encoding
try:
return text.decode('utf-8')
except UnicodeDecodeError:
# If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
try:
return text.decode('cp1251') # Windows Cyrillic encoding
except UnicodeDecodeError:
try:
return text.decode('koi8-r') # Russian encoding
except UnicodeDecodeError:
# If all else fails, decode with errors='replace'
return text.decode('utf-8', errors='replace')
elif isinstance(text, str):
# Ensure the string is properly encoded
try:
# Try to encode and decode to ensure it's valid UTF-8
return text.encode('utf-8').decode('utf-8')
except UnicodeEncodeError:
# If there are encoding issues, try to fix them
return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
else:
# Convert other types to string and ensure proper encoding
text_str = str(text)
try:
return text_str.encode('utf-8').decode('utf-8')
except UnicodeEncodeError:
return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
"""
Process all documents from the data folder using appropriate loaders and store in vector DB.
@@ -228,7 +272,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
# Load the document using SimpleDirectoryReader
# This automatically selects the appropriate reader based on file extension
def file_metadata_func(file_path_str):
return {"filename": Path(file_path_str).name}
# Apply proper encoding to filename
filename = ensure_proper_encoding(Path(file_path_str).name)
return {"filename": filename}
reader = SimpleDirectoryReader(
input_files=[file_path],
@@ -241,29 +287,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
# Extract additional metadata based on document type
file_ext = Path(file_path).suffix
# Apply proper encoding to file path
encoded_file_path = ensure_proper_encoding(file_path)
# Add additional metadata
doc.metadata["file_path"] = file_path
doc.metadata["file_path"] = encoded_file_path
doc.metadata["processed_at"] = datetime.now().isoformat()
# Handle document-type-specific metadata
if file_ext.lower() == '.pdf':
# PDF-specific metadata
doc.metadata["page_label"] = doc.metadata.get("page_label", "unknown")
doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
doc.metadata["file_type"] = "pdf"
elif file_ext.lower() in ['.docx', '.odt']:
# Word document metadata
doc.metadata["section"] = doc.metadata.get("section", "unknown")
doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
doc.metadata["file_type"] = "document"
elif file_ext.lower() == '.pptx':
# PowerPoint metadata
doc.metadata["slide_id"] = doc.metadata.get("slide_id", "unknown")
doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
doc.metadata["file_type"] = "presentation"
elif file_ext.lower() == '.xlsx':
# Excel metadata
doc.metadata["sheet_name"] = doc.metadata.get("sheet_name", "unknown")
doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
doc.metadata["file_type"] = "spreadsheet"
# Determine the appropriate text splitter based on file type
@@ -276,17 +325,21 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
nodes_with_enhanced_metadata = []
for i, node in enumerate(nodes):
# Enhance node metadata with additional information
node.metadata["original_doc_id"] = doc.doc_id
node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
node.metadata["chunk_number"] = i
node.metadata["total_chunks"] = len(nodes)
node.metadata["file_path"] = file_path
node.metadata["file_path"] = encoded_file_path
# Ensure the text content is properly encoded
node.text = ensure_proper_encoding(node.text)
nodes_with_enhanced_metadata.append(node)
# Add all nodes to the index at once
if nodes_with_enhanced_metadata:
index.insert_nodes(nodes_with_enhanced_metadata)
logger.info(f"Processed {len(nodes)} nodes from {file_path}")
logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")
# Mark document as processed only after successful insertion
tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})