Working enrichment
Changed files: services/rag/langchain/.gitignore (vendored), plus a CLI module, an enrichment module, and a vector storage module whose file headers were not captured in this view.
@@ -214,3 +214,4 @@ __marimo__/
 
 # Streamlit
 .streamlit/secrets.toml
+document_tracking.db

[CLI module]

@@ -1,8 +1,9 @@
-import click
-from loguru import logger
 import os
 from pathlib import Path
+
+import click
+from loguru import logger
 
 
 # Configure logging to output to both file and stdout as specified in requirements
 def setup_logging():

@@ -28,17 +29,24 @@ def ping():
     click.echo("pong")
 
 
-@cli.command(name="enrich", help="Load documents from data directory and store in vector database")
-@click.option('--data-dir', default="../../../data", help="Path to the data directory")
-@click.option('--collection-name', default="documents", help="Name of the vector store collection")
+@cli.command(
+    name="enrich",
+    help="Load documents from data directory and store in vector database",
+)
+@click.option("--data-dir", default="../../../data", help="Path to the data directory")
+@click.option(
+    "--collection-name",
+    default="documents_langchain",
+    help="Name of the vector store collection",
+)
 def enrich(data_dir, collection_name):
     """Load documents from data directory and store in vector database"""
     logger.info(f"Starting enrichment process for directory: {data_dir}")
 
     try:
         # Import here to avoid circular dependencies
-        from vector_storage import initialize_vector_store
         from enrichment import run_enrichment_process
+        from vector_storage import initialize_vector_store
 
         # Initialize vector store
         vector_store = initialize_vector_store(collection_name=collection_name)
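
The substantive change in this hunk is the default collection name, "documents" to "documents_langchain"; the decorator reflow and import reordering are cosmetic. A minimal sketch of exercising the new default through click's test runner — the module name `main` is an assumption, since the CLI file's path is not captured in this diff:

from click.testing import CliRunner

from main import cli  # assumption: the click group `cli` lives in main.py

runner = CliRunner()
# No --collection-name given, so the command falls back to the new
# default, "documents_langchain". Running this for real requires the
# Qdrant/Ollama services the module connects to.
result = runner.invoke(cli, ["enrich", "--data-dir", "./data"])
print(result.exit_code, result.output)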

[enrichment module]

@@ -6,14 +6,32 @@ from pathlib import Path
 from typing import List, Dict, Any
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import (
-    PyPDFLoader,
-    UnstructuredWordDocumentLoader,
-    UnstructuredPowerPointLoader,
-    PandasExcelLoader,
-    UnstructuredImageLoader,
-    UnstructuredODTLoader,
-)
+from langchain_community.document_loaders import PyPDFLoader
+
+# Dynamically import other loaders to handle optional dependencies
+try:
+    from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+except ImportError:
+    UnstructuredWordDocumentLoader = None
+
+try:
+    from langchain_community.document_loaders import UnstructuredPowerPointLoader
+except ImportError:
+    UnstructuredPowerPointLoader = None
+
+try:
+    from langchain_community.document_loaders import UnstructuredExcelLoader
+except ImportError:
+    UnstructuredExcelLoader = None
+
+try:
+    from langchain_community.document_loaders import UnstructuredImageLoader
+except ImportError:
+    UnstructuredImageLoader = None
+
+try:
+    from langchain_community.document_loaders import UnstructuredODTLoader
+except ImportError:
+    UnstructuredODTLoader = None
 
 from sqlalchemy import create_engine, Column, Integer, String
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
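
Two things happen in this hunk: `PandasExcelLoader` — which does not appear to exist in langchain_community, so the old eager import block likely failed outright — is replaced by `UnstructuredExcelLoader`, and each loader import is wrapped so a missing optional dependency no longer breaks the whole module. The five four-line try/except blocks could also be collapsed into one helper; a sketch (not part of the commit, `_optional_loader` is a hypothetical name) that shares the commit's premise that importing a loader with missing extras raises ImportError:

import importlib

def _optional_loader(name: str):
    """Return a loader class from langchain_community, or None when the
    package or the loader's optional dependencies are missing."""
    try:
        module = importlib.import_module("langchain_community.document_loaders")
        return getattr(module, name)
    except (ImportError, AttributeError):
        return None

UnstructuredWordDocumentLoader = _optional_loader("UnstructuredWordDocumentLoader")
UnstructuredExcelLoader = _optional_loader("UnstructuredExcelLoader")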

@@ -92,15 +110,31 @@ class DocumentEnricher:
         if ext == ".pdf":
             return PyPDFLoader(file_path)
         elif ext in [".docx", ".doc"]:
-            return UnstructuredWordDocumentLoader(file_path)
+            if UnstructuredWordDocumentLoader is None:
+                logger.warning(f"UnstructuredWordDocumentLoader not available for {file_path}. Skipping.")
+                return None
+            return UnstructuredWordDocumentLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         elif ext == ".pptx":
-            return UnstructuredPowerPointLoader(file_path)
+            if UnstructuredPowerPointLoader is None:
+                logger.warning(f"UnstructuredPowerPointLoader not available for {file_path}. Skipping.")
+                return None
+            return UnstructuredPowerPointLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         elif ext in [".xlsx", ".xls"]:
-            return PandasExcelLoader(file_path)
+            if UnstructuredExcelLoader is None:
+                logger.warning(f"UnstructuredExcelLoader not available for {file_path}. Skipping.")
+                return None
+            return UnstructuredExcelLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
-            return UnstructuredImageLoader(file_path)
+            if UnstructuredImageLoader is None:
+                logger.warning(f"UnstructuredImageLoader not available for {file_path}. Skipping.")
+                return None
+            # Use OCR strategy for images to extract text
+            return UnstructuredImageLoader(file_path, **{"strategy": "ocr_only", "languages": ["rus"]})
         elif ext == ".odt":
-            return UnstructuredODTLoader(file_path)
+            if UnstructuredODTLoader is None:
+                logger.warning(f"UnstructuredODTLoader not available for {file_path}. Skipping.")
+                return None
+            return UnstructuredODTLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         else:
             # For text files and unsupported formats, try to load as text
             try:
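
Two style notes on this hunk: `**{"strategy": "hi_res", "languages": ["rus"]}` is just a verbose spelling of `strategy="hi_res", languages=["rus"]` (the Unstructured loaders forward extra keyword arguments to unstructured's partitioners), and the five identical None-check/warn/return branches invite a dispatch table. A sketch of the latter with hypothetical names, covering only the `hi_res` branches (PDF, image OCR, and the text fallback would keep their own handling):

from loguru import logger

# Assumes the loader names were resolved above (possibly to None) by the
# try/except imports introduced in this commit.
UNSTRUCTURED_KWARGS = {"strategy": "hi_res", "languages": ["rus"]}

LOADERS_BY_EXT = {
    ".docx": UnstructuredWordDocumentLoader,
    ".doc": UnstructuredWordDocumentLoader,
    ".pptx": UnstructuredPowerPointLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".xls": UnstructuredExcelLoader,
    ".odt": UnstructuredODTLoader,
}

def get_unstructured_loader(file_path: str, ext: str):
    loader_cls = LOADERS_BY_EXT.get(ext)
    if loader_cls is None:
        logger.warning(f"No loader available for {file_path}. Skipping.")
        return None
    # Equivalent to loader_cls(file_path, **{"strategy": ..., "languages": ...})
    return loader_cls(file_path, **UNSTRUCTURED_KWARGS)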

@@ -154,9 +188,6 @@ class DocumentEnricher:
                 # Add to the collection
                 all_docs.extend(split_docs)
 
-                # Mark document as processed
-                self._mark_document_processed(file_path)
-
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {str(e)}")
                 continue

@@ -177,9 +208,22 @@ class DocumentEnricher:
         logger.info(f"Loaded and split {len(documents)} document chunks, adding to vector store...")
 
         # Add documents to vector store
-        self.vector_store.add_documents(documents)
-
-        logger.info(f"Successfully added {len(documents)} document chunks to vector store.")
+        try:
+            self.vector_store.add_documents(documents)
+
+            # Only mark documents as processed after successful insertion to vector store
+            processed_file_paths = set()
+            for doc in documents:
+                if 'source' in doc.metadata:
+                    processed_file_paths.add(doc.metadata['source'])
+
+            for file_path in processed_file_paths:
+                self._mark_document_processed(file_path)
+
+            logger.info(f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_paths)} files as processed.")
+        except Exception as e:
+            logger.error(f"Error adding documents to vector store: {str(e)}")
+            raise
 
 
 def get_all_documents_from_data_dir(data_dir: str = str(DATA_DIR)) -> List[str]:
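
These two hunks move `_mark_document_processed` from per-file loading to after a successful `add_documents` call, so a vector-store failure no longer leaves files wrongly marked as done. The tracking model itself (backing the `document_tracking.db` now ignored in `.gitignore`) is outside this diff; the following is a guess consistent with the module's SQLAlchemy imports, with all names hypothetical:

# Hypothetical reconstruction — the real model is not shown in this commit.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class ProcessedDocument(Base):  # class and table names are guesses
    __tablename__ = "processed_documents"
    id = Column(Integer, primary_key=True)
    file_path = Column(String, unique=True)

engine = create_engine("sqlite:///document_tracking.db")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

def _mark_document_processed(file_path: str) -> None:
    """Record a file as processed; the unique constraint keeps this idempotent."""
    with Session() as session:
        if not session.query(ProcessedDocument).filter_by(file_path=file_path).first():
            session.add(ProcessedDocument(file_path=file_path))
            session.commit()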

[vector storage module]

@@ -46,26 +46,47 @@ def initialize_vector_store(
         base_url="http://localhost:11434",  # Default Ollama URL
     )
 
-    # Create or get the vector store
-    vector_store = Qdrant(
-        client=client,
-        collection_name=collection_name,
-        embeddings=embeddings,
-    )
-
-    # If recreate_collection is True, we'll delete and recreate the collection
-    if recreate_collection and collection_name in [
-        col.name for col in client.get_collections().collections
-    ]:
-        client.delete_collection(collection_name)
-
-        # Recreate with proper configuration
-        vector_store = Qdrant.from_documents(
-            documents=[],
-            embedding=embeddings,
-            url=f"http://{QDRANT_HOST}:{QDRANT_REST_PORT}",
-            collection_name=collection_name,
-            force_recreate=True,
-        )
+    # Check if collection exists, if not create it
+    collection_exists = False
+    try:
+        client.get_collection(collection_name)
+        collection_exists = True
+    except Exception:
+        # Collection doesn't exist, we'll create it
+        collection_exists = False
+
+    if recreate_collection and collection_exists:
+        client.delete_collection(collection_name)
+        collection_exists = False
+
+    # If collection doesn't exist, create it using the client directly
+    if not collection_exists:
+        # Create collection using the Qdrant client directly
+        from qdrant_client.http.models import Distance, VectorParams
+        import numpy as np
+
+        # First, we need to determine the embedding size by creating a sample embedding
+        sample_embedding = embeddings.embed_query("sample text for dimension detection")
+        vector_size = len(sample_embedding)
+
+        # Create the collection with appropriate vector size
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
+        )
+
+        # Now create the Qdrant instance connected to the newly created collection
+        vector_store = Qdrant(
+            client=client,
+            collection_name=collection_name,
+            embeddings=embeddings,
+        )
+    else:
+        # Collection exists, just connect to it
+        vector_store = Qdrant(
+            client=client,
+            collection_name=collection_name,
+            embeddings=embeddings,
+        )
 
     return vector_store
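
A couple of observations on this hunk: the added `import numpy as np` is unused, both branches construct an identical `Qdrant(...)` instance, and probing existence via `get_collection` under a bare `except Exception` will also misread connection failures as "collection missing". Recent qdrant-client releases expose a direct check; a sketch under the assumption that the installed client ships `QdrantClient.collection_exists` (present in newer versions):

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def ensure_collection(client: QdrantClient, name: str, vector_size: int) -> None:
    """Create the collection only when it is genuinely absent; connection
    errors propagate instead of being mistaken for a missing collection."""
    if not client.collection_exists(name):
        client.create_collection(
            collection_name=name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )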

@@ -116,7 +137,7 @@ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 OPENROUTER_EMBEDDING_MODEL = os.getenv("OPENROUTER_EMBEDDING_MODEL", "openai/text-embedding-ada-002")
 
 def initialize_vector_store_with_openrouter(
-    collection_name: str = "documents"
+    collection_name: str = "documents_langchain"
 ) -> Qdrant:
     # Initialize Qdrant client
     client = QdrantClient(