Retrieval and Russian language update
@@ -158,6 +158,50 @@ def get_text_splitter(file_extension: str):
     )
 
 
+def ensure_proper_encoding(text):
+    """
+    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.
+
+    Args:
+        text: Text that may need encoding correction
+
+    Returns:
+        Properly encoded text string
+    """
+    if text is None:
+        return "unknown"
+
+    if isinstance(text, bytes):
+        # Decode bytes to string with proper encoding
+        try:
+            return text.decode('utf-8')
+        except UnicodeDecodeError:
+            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
+            try:
+                return text.decode('cp1251')  # Windows Cyrillic encoding
+            except UnicodeDecodeError:
+                try:
+                    return text.decode('koi8-r')  # Russian encoding
+                except UnicodeDecodeError:
+                    # If all else fails, decode with errors='replace'
+                    return text.decode('utf-8', errors='replace')
+    elif isinstance(text, str):
+        # Ensure the string is properly encoded
+        try:
+            # Try to encode and decode to ensure it's valid UTF-8
+            return text.encode('utf-8').decode('utf-8')
+        except UnicodeEncodeError:
+            # If there are encoding issues, try to fix them
+            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+    else:
+        # Convert other types to string and ensure proper encoding
+        text_str = str(text)
+        try:
+            return text_str.encode('utf-8').decode('utf-8')
+        except UnicodeEncodeError:
+            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+
+
 def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
     """
     Process all documents from the data folder using appropriate loaders and store in vector DB.
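For context on what the new helper is meant to do, here is a minimal usage sketch of the decode fallback (UTF-8 first, then cp1251, then koi8-r, then UTF-8 with replacement characters). It is illustrative only and not part of the commit; it assumes ensure_proper_encoding has been imported from the module this diff modifies.

# Illustrative only, not part of the commit. Assumes the helper is importable
# from the ingestion module touched by this diff (module path not shown here).
# from <ingestion_module> import ensure_proper_encoding

assert ensure_proper_encoding("Привет".encode("utf-8")) == "Привет"   # first decode attempt succeeds
assert ensure_proper_encoding("Привет".encode("cp1251")) == "Привет"  # UTF-8 decode fails, cp1251 fallback recovers it
assert ensure_proper_encoding(None) == "unknown"                      # None is normalized to "unknown"
assert ensure_proper_encoding(42) == "42"                             # non-string values are stringified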
@@ -228,7 +272,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
         # Load the document using SimpleDirectoryReader
         # This automatically selects the appropriate reader based on file extension
         def file_metadata_func(file_path_str):
-            return {"filename": Path(file_path_str).name}
+            # Apply proper encoding to filename
+            filename = ensure_proper_encoding(Path(file_path_str).name)
+            return {"filename": filename}

         reader = SimpleDirectoryReader(
             input_files=[file_path],
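The hunk above is cut off before the rest of the SimpleDirectoryReader arguments, so the wiring of file_metadata_func is not visible here. Presumably it is passed through the reader's file_metadata callback; a rough sketch of that wiring, under that assumption:

# Sketch under assumption; the actual argument list is truncated in this hunk.
# Import path varies with the llama_index version in use.
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(
    input_files=[file_path],           # single file per reader, as in the diff
    file_metadata=file_metadata_func,  # attaches the encoding-safe filename to each Document
)
documents = reader.load_data()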
@@ -241,29 +287,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
             # Extract additional metadata based on document type
             file_ext = Path(file_path).suffix

+            # Apply proper encoding to file path
+            encoded_file_path = ensure_proper_encoding(file_path)
+
             # Add additional metadata
-            doc.metadata["file_path"] = file_path
+            doc.metadata["file_path"] = encoded_file_path
             doc.metadata["processed_at"] = datetime.now().isoformat()

             # Handle document-type-specific metadata
             if file_ext.lower() == '.pdf':
                 # PDF-specific metadata
-                doc.metadata["page_label"] = doc.metadata.get("page_label", "unknown")
+                doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
                 doc.metadata["file_type"] = "pdf"

             elif file_ext.lower() in ['.docx', '.odt']:
                 # Word document metadata
-                doc.metadata["section"] = doc.metadata.get("section", "unknown")
+                doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
                 doc.metadata["file_type"] = "document"

             elif file_ext.lower() == '.pptx':
                 # PowerPoint metadata
-                doc.metadata["slide_id"] = doc.metadata.get("slide_id", "unknown")
+                doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
                 doc.metadata["file_type"] = "presentation"

             elif file_ext.lower() == '.xlsx':
                 # Excel metadata
-                doc.metadata["sheet_name"] = doc.metadata.get("sheet_name", "unknown")
+                doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
                 doc.metadata["file_type"] = "spreadsheet"

             # Determine the appropriate text splitter based on file type
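To make the intent of the normalization above concrete, this is roughly what the per-document metadata looks like for a PDF with a Cyrillic filename after this hunk runs. The values are hypothetical and only illustrate that Cyrillic fields survive intact:

# Hypothetical values, illustrative only; not produced by the commit itself.
example_pdf_metadata = {
    "filename": "отчёт_2024.pdf",         # from file_metadata_func, encoding-safe
    "file_path": "/data/отчёт_2024.pdf",  # normalized via ensure_proper_encoding
    "processed_at": "2024-01-15T10:30:00",
    "page_label": "1",                    # defaults to "unknown" when the loader omits it
    "file_type": "pdf",
}
print(example_pdf_metadata["filename"])   # Cyrillic name prints intact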
@@ -276,17 +325,21 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
             nodes_with_enhanced_metadata = []
             for i, node in enumerate(nodes):
                 # Enhance node metadata with additional information
-                node.metadata["original_doc_id"] = doc.doc_id
+                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                 node.metadata["chunk_number"] = i
                 node.metadata["total_chunks"] = len(nodes)
-                node.metadata["file_path"] = file_path
+                node.metadata["file_path"] = encoded_file_path
+
+                # Ensure the text content is properly encoded
+                node.text = ensure_proper_encoding(node.text)
+
                 nodes_with_enhanced_metadata.append(node)

             # Add all nodes to the index at once
             if nodes_with_enhanced_metadata:
                 index.insert_nodes(nodes_with_enhanced_metadata)

-            logger.info(f"Processed {len(nodes)} nodes from {file_path}")
+            logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

             # Mark document as processed only after successful insertion
             tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})