Prefect client prep for LangChain

2026-02-16 15:12:44 +03:00
parent 93d538ecc6
commit 77c578c9e6
6 changed files with 148 additions and 94 deletions


@@ -6,24 +6,24 @@ processing them with appropriate loaders, splitting them into chunks,
 and storing them in the vector database with proper metadata.
 """
-import os
 import hashlib
-from pathlib import Path
-from typing import List, Dict, Any
-from datetime import datetime
+import os
 import sqlite3
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List
+from llama_index.core import Document, SimpleDirectoryReader
+from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
 from loguru import logger
 from tqdm import tqdm
-from llama_index.core import SimpleDirectoryReader, Document
-from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
-# Removed unused import
-from vector_storage import get_vector_store_and_index
 # Import the new configuration module
 from config import get_embedding_model
+# Removed unused import
+from vector_storage import get_vector_store_and_index
 class DocumentTracker:
     """Class to handle tracking of processed documents to avoid re-processing."""
@@ -38,7 +38,7 @@ class DocumentTracker:
         cursor = conn.cursor()
         # Create table for tracking processed documents
-        cursor.execute('''
+        cursor.execute("""
             CREATE TABLE IF NOT EXISTS processed_documents (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 filename TEXT UNIQUE NOT NULL,
@@ -47,7 +47,7 @@ class DocumentTracker:
                 processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                 metadata_json TEXT
             )
-        ''')
+        """)
        conn.commit()
        conn.close()
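For context on the checksum column above, a minimal sketch of how a file checksum can be computed with hashlib (which this module imports); the helper name compute_checksum is hypothetical and not part of this commit:

    import hashlib

    def compute_checksum(filepath: str) -> str:
        # Hash the file in fixed-size blocks so large documents
        # are never read into memory all at once.
        sha = hashlib.sha256()
        with open(filepath, "rb") as f:
            for block in iter(lambda: f.read(8192), b""):
                sha.update(block)
        return sha.hexdigest()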
@@ -63,7 +63,7 @@ class DocumentTracker:
         cursor.execute(
             "SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
-            (filepath, checksum)
+            (filepath, checksum),
         )
         count = cursor.fetchone()[0]
@@ -79,11 +79,14 @@ class DocumentTracker:
         filename = Path(filepath).name
         try:
-            cursor.execute('''
+            cursor.execute(
+                """
                 INSERT OR REPLACE INTO processed_documents
                 (filename, filepath, checksum, processed_at, metadata_json)
                 VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
-            ''', (filename, filepath, checksum, str(metadata) if metadata else None))
+                """,
+                (filename, filepath, checksum, str(metadata) if metadata else None),
+            )
             conn.commit()
             logger.info(f"Document marked as processed: {filepath}")
@@ -104,62 +107,67 @@ class DocumentTracker:
 def get_text_splitter(file_extension: str):
     """Get appropriate text splitter based on file type."""
-    from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, TokenTextSplitter
-    from llama_index.core.node_parser import MarkdownElementNodeParser
+    from llama_index.core.node_parser import (
+        CodeSplitter,
+        MarkdownElementNodeParser,
+        SentenceSplitter,
+        TokenTextSplitter,
+    )
     # For code files, use CodeSplitter
-    if file_extension.lower() in ['.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.go', '.rs', '.php', '.html', '.css', '.md', '.rst']:
+    if file_extension.lower() in [
+        ".py",
+        ".js",
+        ".ts",
+        ".java",
+        ".cpp",
+        ".c",
+        ".h",
+        ".cs",
+        ".go",
+        ".rs",
+        ".php",
+        ".html",
+        ".css",
+        ".md",
+        ".rst",
+    ]:
         return CodeSplitter(language="python", max_chars=1000)
     # For PDF files, use a parser that can handle multi-page documents
-    elif file_extension.lower() == '.pdf':
+    elif file_extension.lower() == ".pdf":
         return SentenceSplitter(
             chunk_size=512,  # Smaller chunks for dense PDF content
-            chunk_overlap=100
+            chunk_overlap=100,
         )
     # For presentation files (PowerPoint), use smaller chunks
-    elif file_extension.lower() == '.pptx':
+    elif file_extension.lower() == ".pptx":
         return SentenceSplitter(
             chunk_size=256,  # Slides typically have less text
-            chunk_overlap=50
+            chunk_overlap=50,
         )
     # For spreadsheets, use smaller chunks
-    elif file_extension.lower() == '.xlsx':
-        return SentenceSplitter(
-            chunk_size=256,
-            chunk_overlap=50
-        )
+    elif file_extension.lower() == ".xlsx":
+        return SentenceSplitter(chunk_size=256, chunk_overlap=50)
     # For text-heavy documents like Word, use medium-sized chunks
-    elif file_extension.lower() in ['.docx', '.odt']:
-        return SentenceSplitter(
-            chunk_size=768,
-            chunk_overlap=150
-        )
+    elif file_extension.lower() in [".docx", ".odt"]:
+        return SentenceSplitter(chunk_size=768, chunk_overlap=150)
     # For plain text files, use larger chunks
-    elif file_extension.lower() == '.txt':
-        return SentenceSplitter(
-            chunk_size=1024,
-            chunk_overlap=200
-        )
+    elif file_extension.lower() == ".txt":
+        return SentenceSplitter(chunk_size=1024, chunk_overlap=200)
     # For image files, we'll handle them differently (metadata extraction)
-    elif file_extension.lower() in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']:
+    elif file_extension.lower() in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg"]:
         # Images will be handled by multimodal models, return a simple splitter
-        return SentenceSplitter(
-            chunk_size=512,
-            chunk_overlap=100
-        )
+        return SentenceSplitter(chunk_size=512, chunk_overlap=100)
     # For other files, use a standard SentenceSplitter
     else:
-        return SentenceSplitter(
-            chunk_size=768,
-            chunk_overlap=150
-        )
+        return SentenceSplitter(chunk_size=768, chunk_overlap=150)
 def ensure_proper_encoding(text):
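For illustration, a sketch of how get_text_splitter feeds chunking; get_nodes_from_documents is the standard LlamaIndex node-parser API, and the sample text is invented:

    from llama_index.core import Document

    splitter = get_text_splitter(".pdf")  # SentenceSplitter(chunk_size=512, chunk_overlap=100)
    nodes = splitter.get_nodes_from_documents([Document(text="A long PDF passage ...")])
    print(len(nodes), "chunks")

Note that the code branch hard-codes language="python" in CodeSplitter for every code-like extension, so .js or .go sources are split with Python grammar rules; mapping the extension to the matching language would be a natural follow-up.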
@@ -178,35 +186,41 @@ def ensure_proper_encoding(text):
     if isinstance(text, bytes):
         # Decode bytes to string with proper encoding
         try:
-            return text.decode('utf-8')
+            return text.decode("utf-8")
         except UnicodeDecodeError:
             # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
             try:
-                return text.decode('cp1251')  # Windows Cyrillic encoding
+                return text.decode("cp1251")  # Windows Cyrillic encoding
             except UnicodeDecodeError:
                 try:
-                    return text.decode('koi8-r')  # Russian encoding
+                    return text.decode("koi8-r")  # Russian encoding
                 except UnicodeDecodeError:
                     # If all else fails, decode with errors='replace'
-                    return text.decode('utf-8', errors='replace')
+                    return text.decode("utf-8", errors="replace")
     elif isinstance(text, str):
         # Ensure the string is properly encoded
         try:
             # Try to encode and decode to ensure it's valid UTF-8
-            return text.encode('utf-8').decode('utf-8')
+            return text.encode("utf-8").decode("utf-8")
         except UnicodeEncodeError:
             # If there are encoding issues, try to fix them
-            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+            return text.encode("utf-8", errors="replace").decode(
+                "utf-8", errors="replace"
+            )
     else:
         # Convert other types to string and ensure proper encoding
         text_str = str(text)
         try:
-            return text_str.encode('utf-8').decode('utf-8')
+            return text_str.encode("utf-8").decode("utf-8")
         except UnicodeEncodeError:
-            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+            return text_str.encode("utf-8", errors="replace").decode(
+                "utf-8", errors="replace"
+            )
-def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
+def process_documents_from_data_folder(
+    data_path: str = "../../../data", recursive: bool = True
+):
     """
     Process all documents from the data folder using appropriate loaders and store in vector DB.
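The fallback chain in ensure_proper_encoding can be exercised directly. Bytes that are valid CP1251 but not valid UTF-8 trip the first decode and are recovered by the second attempt; a small hedged example:

    # "Привет" in Windows-1251 (0xCF 0xF0 ...) is not valid UTF-8,
    # so the utf-8 attempt raises and the cp1251 branch succeeds.
    raw = "Привет".encode("cp1251")
    assert ensure_proper_encoding(raw) == "Привет"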
@@ -238,9 +252,22 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
     # Find all supported files in the data directory
     supported_extensions = {
-        '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.txt',
-        '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
-        '.zip', '.rar', '.tar', '.gz'
+        ".pdf",
+        ".docx",
+        ".xlsx",
+        ".pptx",
+        ".odt",
+        ".txt",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".svg",
+        ".zip",
+        ".rar",
+        ".tar",
+        ".gz",
     }
     # Walk through the directory structure
@@ -265,9 +292,11 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
     # Initialize progress bar
     pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
     for file_path in all_files:
-        logger.info(f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})")
+        logger.info(
+            f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
+        )
         # Check if document has already been processed
         if tracker.is_document_processed(file_path):
@@ -286,8 +315,7 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
                 return {"filename": filename}
             reader = SimpleDirectoryReader(
-                input_files=[file_path],
-                file_metadata=file_metadata_func
+                input_files=[file_path], file_metadata=file_metadata_func
             )
             documents = reader.load_data()
@@ -304,24 +332,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
                 doc.metadata["processed_at"] = datetime.now().isoformat()
                 # Handle document-type-specific metadata
-                if file_ext.lower() == '.pdf':
+                if file_ext.lower() == ".pdf":
                     # PDF-specific metadata
-                    doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
+                    doc.metadata["page_label"] = ensure_proper_encoding(
+                        doc.metadata.get("page_label", "unknown")
+                    )
                     doc.metadata["file_type"] = "pdf"
-                elif file_ext.lower() in ['.docx', '.odt']:
+                elif file_ext.lower() in [".docx", ".odt"]:
                     # Word document metadata
-                    doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
+                    doc.metadata["section"] = ensure_proper_encoding(
+                        doc.metadata.get("section", "unknown")
+                    )
                     doc.metadata["file_type"] = "document"
-                elif file_ext.lower() == '.pptx':
+                elif file_ext.lower() == ".pptx":
                     # PowerPoint metadata
-                    doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
+                    doc.metadata["slide_id"] = ensure_proper_encoding(
+                        doc.metadata.get("slide_id", "unknown")
+                    )
                     doc.metadata["file_type"] = "presentation"
-                elif file_ext.lower() == '.xlsx':
+                elif file_ext.lower() == ".xlsx":
                     # Excel metadata
-                    doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
+                    doc.metadata["sheet_name"] = ensure_proper_encoding(
+                        doc.metadata.get("sheet_name", "unknown")
+                    )
                     doc.metadata["file_type"] = "spreadsheet"
             # Determine the appropriate text splitter based on file type
@@ -334,7 +370,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
             nodes_with_enhanced_metadata = []
             for i, node in enumerate(nodes):
                 # Enhance node metadata with additional information
-                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
+                node.metadata["original_doc_id"] = ensure_proper_encoding(
+                    doc.doc_id
+                )
                 node.metadata["chunk_number"] = i
                 node.metadata["total_chunks"] = len(nodes)
                 node.metadata["file_path"] = encoded_file_path
@@ -357,12 +395,14 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
         except Exception as e:
             logger.error(f"Error processing file {file_path}: {str(e)}")
         # Update progress bar regardless of success or failure
         pbar.update(1)
     pbar.close()
-    logger.info(f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}")
+    logger.info(
+        f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
+    )
 def enrich_documents():
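A hedged sketch of driving the committed pipeline end to end; the relative data path comes from the default above, and running the module as a script is an assumption:

    if __name__ == "__main__":
        # Recursively walk the data folder, skipping files whose
        # path and checksum are already in the SQLite tracker.
        process_documents_from_data_folder(data_path="../../../data", recursive=True)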