Prefect client prep for langchain
This commit is contained in:
@@ -6,24 +6,24 @@ processing them with appropriate loaders, splitting them into chunks,
|
||||
and storing them in the vector database with proper metadata.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import os
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from llama_index.core import Document, SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader, Document
|
||||
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
|
||||
# Removed unused import
|
||||
|
||||
from vector_storage import get_vector_store_and_index
|
||||
|
||||
# Import the new configuration module
|
||||
from config import get_embedding_model
|
||||
|
||||
# Removed unused import
|
||||
from vector_storage import get_vector_store_and_index
|
||||
|
||||
|
||||
class DocumentTracker:
|
||||
"""Class to handle tracking of processed documents to avoid re-processing."""
|
||||
@@ -38,7 +38,7 @@ class DocumentTracker:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Create table for tracking processed documents
|
||||
cursor.execute('''
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS processed_documents (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
filename TEXT UNIQUE NOT NULL,
|
||||
@@ -47,7 +47,7 @@ class DocumentTracker:
|
||||
processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
metadata_json TEXT
|
||||
)
|
||||
''')
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -63,7 +63,7 @@ class DocumentTracker:
|
||||
|
||||
cursor.execute(
|
||||
"SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
|
||||
(filepath, checksum)
|
||||
(filepath, checksum),
|
||||
)
|
||||
count = cursor.fetchone()[0]
|
||||
|
||||
@@ -79,11 +79,14 @@ class DocumentTracker:
|
||||
filename = Path(filepath).name
|
||||
|
||||
try:
|
||||
cursor.execute('''
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO processed_documents
|
||||
(filename, filepath, checksum, processed_at, metadata_json)
|
||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
|
||||
''', (filename, filepath, checksum, str(metadata) if metadata else None))
|
||||
""",
|
||||
(filename, filepath, checksum, str(metadata) if metadata else None),
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
logger.info(f"Document marked as processed: {filepath}")
|
||||
@@ -104,62 +107,67 @@ class DocumentTracker:
|
||||
|
||||
def get_text_splitter(file_extension: str):
|
||||
"""Get appropriate text splitter based on file type."""
|
||||
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, TokenTextSplitter
|
||||
from llama_index.core.node_parser import MarkdownElementNodeParser
|
||||
from llama_index.core.node_parser import (
|
||||
CodeSplitter,
|
||||
MarkdownElementNodeParser,
|
||||
SentenceSplitter,
|
||||
TokenTextSplitter,
|
||||
)
|
||||
|
||||
# For code files, use CodeSplitter
|
||||
if file_extension.lower() in ['.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.go', '.rs', '.php', '.html', '.css', '.md', '.rst']:
|
||||
if file_extension.lower() in [
|
||||
".py",
|
||||
".js",
|
||||
".ts",
|
||||
".java",
|
||||
".cpp",
|
||||
".c",
|
||||
".h",
|
||||
".cs",
|
||||
".go",
|
||||
".rs",
|
||||
".php",
|
||||
".html",
|
||||
".css",
|
||||
".md",
|
||||
".rst",
|
||||
]:
|
||||
return CodeSplitter(language="python", max_chars=1000)
|
||||
|
||||
# For PDF files, use a parser that can handle multi-page documents
|
||||
elif file_extension.lower() == '.pdf':
|
||||
elif file_extension.lower() == ".pdf":
|
||||
return SentenceSplitter(
|
||||
chunk_size=512, # Smaller chunks for dense PDF content
|
||||
chunk_overlap=100
|
||||
chunk_overlap=100,
|
||||
)
|
||||
|
||||
# For presentation files (PowerPoint), use smaller chunks
|
||||
elif file_extension.lower() == '.pptx':
|
||||
elif file_extension.lower() == ".pptx":
|
||||
return SentenceSplitter(
|
||||
chunk_size=256, # Slides typically have less text
|
||||
chunk_overlap=50
|
||||
chunk_overlap=50,
|
||||
)
|
||||
|
||||
# For spreadsheets, use smaller chunks
|
||||
elif file_extension.lower() == '.xlsx':
|
||||
return SentenceSplitter(
|
||||
chunk_size=256,
|
||||
chunk_overlap=50
|
||||
)
|
||||
elif file_extension.lower() == ".xlsx":
|
||||
return SentenceSplitter(chunk_size=256, chunk_overlap=50)
|
||||
|
||||
# For text-heavy documents like Word, use medium-sized chunks
|
||||
elif file_extension.lower() in ['.docx', '.odt']:
|
||||
return SentenceSplitter(
|
||||
chunk_size=768,
|
||||
chunk_overlap=150
|
||||
)
|
||||
elif file_extension.lower() in [".docx", ".odt"]:
|
||||
return SentenceSplitter(chunk_size=768, chunk_overlap=150)
|
||||
|
||||
# For plain text files, use larger chunks
|
||||
elif file_extension.lower() == '.txt':
|
||||
return SentenceSplitter(
|
||||
chunk_size=1024,
|
||||
chunk_overlap=200
|
||||
)
|
||||
elif file_extension.lower() == ".txt":
|
||||
return SentenceSplitter(chunk_size=1024, chunk_overlap=200)
|
||||
|
||||
# For image files, we'll handle them differently (metadata extraction)
|
||||
elif file_extension.lower() in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']:
|
||||
elif file_extension.lower() in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg"]:
|
||||
# Images will be handled by multimodal models, return a simple splitter
|
||||
return SentenceSplitter(
|
||||
chunk_size=512,
|
||||
chunk_overlap=100
|
||||
)
|
||||
return SentenceSplitter(chunk_size=512, chunk_overlap=100)
|
||||
|
||||
# For other files, use a standard SentenceSplitter
|
||||
else:
|
||||
return SentenceSplitter(
|
||||
chunk_size=768,
|
||||
chunk_overlap=150
|
||||
)
|
||||
return SentenceSplitter(chunk_size=768, chunk_overlap=150)
|
||||
|
||||
|
||||
def ensure_proper_encoding(text):
|
||||
@@ -178,35 +186,41 @@ def ensure_proper_encoding(text):
|
||||
if isinstance(text, bytes):
|
||||
# Decode bytes to string with proper encoding
|
||||
try:
|
||||
return text.decode('utf-8')
|
||||
return text.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
# If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
|
||||
try:
|
||||
return text.decode('cp1251') # Windows Cyrillic encoding
|
||||
return text.decode("cp1251") # Windows Cyrillic encoding
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
return text.decode('koi8-r') # Russian encoding
|
||||
return text.decode("koi8-r") # Russian encoding
|
||||
except UnicodeDecodeError:
|
||||
# If all else fails, decode with errors='replace'
|
||||
return text.decode('utf-8', errors='replace')
|
||||
return text.decode("utf-8", errors="replace")
|
||||
elif isinstance(text, str):
|
||||
# Ensure the string is properly encoded
|
||||
try:
|
||||
# Try to encode and decode to ensure it's valid UTF-8
|
||||
return text.encode('utf-8').decode('utf-8')
|
||||
return text.encode("utf-8").decode("utf-8")
|
||||
except UnicodeEncodeError:
|
||||
# If there are encoding issues, try to fix them
|
||||
return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
||||
return text.encode("utf-8", errors="replace").decode(
|
||||
"utf-8", errors="replace"
|
||||
)
|
||||
else:
|
||||
# Convert other types to string and ensure proper encoding
|
||||
text_str = str(text)
|
||||
try:
|
||||
return text_str.encode('utf-8').decode('utf-8')
|
||||
return text_str.encode("utf-8").decode("utf-8")
|
||||
except UnicodeEncodeError:
|
||||
return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
||||
return text_str.encode("utf-8", errors="replace").decode(
|
||||
"utf-8", errors="replace"
|
||||
)
|
||||
|
||||
|
||||
def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
|
||||
def process_documents_from_data_folder(
|
||||
data_path: str = "../../../data", recursive: bool = True
|
||||
):
|
||||
"""
|
||||
Process all documents from the data folder using appropriate loaders and store in vector DB.
|
||||
|
||||
@@ -238,9 +252,22 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
|
||||
# Find all supported files in the data directory
|
||||
supported_extensions = {
|
||||
'.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.txt',
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
|
||||
'.zip', '.rar', '.tar', '.gz'
|
||||
".pdf",
|
||||
".docx",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
".odt",
|
||||
".txt",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".gif",
|
||||
".bmp",
|
||||
".svg",
|
||||
".zip",
|
||||
".rar",
|
||||
".tar",
|
||||
".gz",
|
||||
}
|
||||
|
||||
# Walk through the directory structure
|
||||
@@ -265,9 +292,11 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
|
||||
# Initialize progress bar
|
||||
pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
|
||||
|
||||
|
||||
for file_path in all_files:
|
||||
logger.info(f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})")
|
||||
logger.info(
|
||||
f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
|
||||
)
|
||||
|
||||
# Check if document has already been processed
|
||||
if tracker.is_document_processed(file_path):
|
||||
@@ -286,8 +315,7 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
return {"filename": filename}
|
||||
|
||||
reader = SimpleDirectoryReader(
|
||||
input_files=[file_path],
|
||||
file_metadata=file_metadata_func
|
||||
input_files=[file_path], file_metadata=file_metadata_func
|
||||
)
|
||||
documents = reader.load_data()
|
||||
|
||||
@@ -304,24 +332,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
doc.metadata["processed_at"] = datetime.now().isoformat()
|
||||
|
||||
# Handle document-type-specific metadata
|
||||
if file_ext.lower() == '.pdf':
|
||||
if file_ext.lower() == ".pdf":
|
||||
# PDF-specific metadata
|
||||
doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
|
||||
doc.metadata["page_label"] = ensure_proper_encoding(
|
||||
doc.metadata.get("page_label", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "pdf"
|
||||
|
||||
elif file_ext.lower() in ['.docx', '.odt']:
|
||||
elif file_ext.lower() in [".docx", ".odt"]:
|
||||
# Word document metadata
|
||||
doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
|
||||
doc.metadata["section"] = ensure_proper_encoding(
|
||||
doc.metadata.get("section", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "document"
|
||||
|
||||
elif file_ext.lower() == '.pptx':
|
||||
elif file_ext.lower() == ".pptx":
|
||||
# PowerPoint metadata
|
||||
doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
|
||||
doc.metadata["slide_id"] = ensure_proper_encoding(
|
||||
doc.metadata.get("slide_id", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "presentation"
|
||||
|
||||
elif file_ext.lower() == '.xlsx':
|
||||
elif file_ext.lower() == ".xlsx":
|
||||
# Excel metadata
|
||||
doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
|
||||
doc.metadata["sheet_name"] = ensure_proper_encoding(
|
||||
doc.metadata.get("sheet_name", "unknown")
|
||||
)
|
||||
doc.metadata["file_type"] = "spreadsheet"
|
||||
|
||||
# Determine the appropriate text splitter based on file type
|
||||
@@ -334,7 +370,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
nodes_with_enhanced_metadata = []
|
||||
for i, node in enumerate(nodes):
|
||||
# Enhance node metadata with additional information
|
||||
node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
|
||||
node.metadata["original_doc_id"] = ensure_proper_encoding(
|
||||
doc.doc_id
|
||||
)
|
||||
node.metadata["chunk_number"] = i
|
||||
node.metadata["total_chunks"] = len(nodes)
|
||||
node.metadata["file_path"] = encoded_file_path
|
||||
@@ -357,12 +395,14 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {file_path}: {str(e)}")
|
||||
|
||||
|
||||
# Update progress bar regardless of success or failure
|
||||
pbar.update(1)
|
||||
|
||||
pbar.close()
|
||||
logger.info(f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}")
|
||||
logger.info(
|
||||
f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
|
||||
)
|
||||
|
||||
|
||||
def enrich_documents():
|
||||
|
||||
Reference in New Issue
Block a user