From 5721bad117103fc2e5a47ddebba40f3ae488dc67 Mon Sep 17 00:00:00 2001
From: idchlife
Date: Mon, 9 Mar 2026 10:21:39 +0300
Subject: [PATCH] fix for openai compatibility model with needed parameters

---
 services/rag/llamaindex/.env.dist      |  3 +++
 services/rag/llamaindex/chat_engine.py | 14 ++++++++++++++
 services/rag/llamaindex/config.py      | 21 +++++++++++++++----
 .../helpers/openai_compatible_llm.py   |  5 +++++
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/services/rag/llamaindex/.env.dist b/services/rag/llamaindex/.env.dist
index d6a3dd5..657f4e8 100644
--- a/services/rag/llamaindex/.env.dist
+++ b/services/rag/llamaindex/.env.dist
@@ -14,6 +14,9 @@ QDRANT_GRPC_PORT=6334
 # OpenAI Configuration (for reference - uncomment and configure when using OpenAI strategy)
 # OPENAI_CHAT_URL=https://api.openai.com/v1
 # OPENAI_CHAT_KEY=your_openai_api_key_here
+# OPENAI_CHAT_TEMPERATURE=0.1
+# OPENAI_CHAT_MAX_TOKENS=1024
+# OPENAI_CHAT_REASONING_EFFORT=low
 # OPENAI_CHAT_IS_FUNCTION_CALLING_MODEL=false
 # OPENAI_EMBEDDING_MODEL=text-embedding-3-small
 # OPENAI_EMBEDDING_BASE_URL=https://api.openai.com/v1
diff --git a/services/rag/llamaindex/chat_engine.py b/services/rag/llamaindex/chat_engine.py
index 162ad70..b6c879b 100644
--- a/services/rag/llamaindex/chat_engine.py
+++ b/services/rag/llamaindex/chat_engine.py
@@ -17,6 +17,7 @@
 from typing import Any, Iterable, List
 from llama_index.core import PromptTemplate
 from llama_index.core.agent import AgentWorkflow
+from llama_index.core.base.llms.types import ChatMessage, MessageRole
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.schema import NodeWithScore
 from llama_index.core.tools import FunctionTool
@@ -230,6 +231,19 @@ def synthesize_answer(query: str, sources: list[dict[str, Any]], agent_draft: st
         context_json=context_json,
     )
     logger.info("Synthesizing grounded answer from retrieved sources")
+    # Prefer chat API for chat-capable models; fall back to completion if unavailable.
+    try:
+        if hasattr(llm, "chat"):
+            chat_response = llm.chat(
+                [
+                    ChatMessage(role=MessageRole.SYSTEM, content="You answer with grounded citations only."),
+                    ChatMessage(role=MessageRole.USER, content=prompt),
+                ]
+            )
+            return _normalize_text(getattr(chat_response, "message", chat_response).content)
+    except Exception as e:
+        logger.warning(f"LLM chat synthesis failed, falling back to completion: {e}")
+
     response = llm.complete(prompt)
     return _normalize_text(getattr(response, "text", response))
 
diff --git a/services/rag/llamaindex/config.py b/services/rag/llamaindex/config.py
index e948e90..b2a88ba 100644
--- a/services/rag/llamaindex/config.py
+++ b/services/rag/llamaindex/config.py
@@ -93,12 +93,19 @@ def get_llm_model():
         return llm
 
     elif strategy == "openai":
-        from llama_index.llms.openai_like import OpenAILike
-        # from helpers.openai_compatible_llm import OpenAICompatibleLLM
+        from helpers.openai_compatible_llm import OpenAICompatibleLLM
 
         openai_chat_url = os.getenv("OPENAI_CHAT_URL", "https://api.openai.com/v1")
         openai_chat_key = os.getenv("OPENAI_CHAT_KEY", "dummy_key_for_template")
         openai_chat_model = os.getenv("OPENAI_CHAT_MODEL", "gpt-3.5-turbo")
+        openai_chat_temperature = float(os.getenv("OPENAI_CHAT_TEMPERATURE", "0.1"))
+        openai_chat_max_tokens_env = os.getenv("OPENAI_CHAT_MAX_TOKENS", "").strip()
+        openai_chat_max_tokens = (
+            int(openai_chat_max_tokens_env) if openai_chat_max_tokens_env else 1024
+        )
+        openai_reasoning_effort = (
+            os.getenv("OPENAI_CHAT_REASONING_EFFORT", "").strip() or None
+        )
         openai_is_fc_model = (
             os.getenv("OPENAI_CHAT_IS_FUNCTION_CALLING_MODEL", "false").lower()
             == "true"
@@ -109,13 +116,19 @@
 
         logger.info(
             f"Initializing OpenAI-compatible chat model: {openai_chat_model} "
-            f"(base={openai_chat_url}, function_calling={openai_is_fc_model})"
+            f"(base={openai_chat_url}, max_tokens={openai_chat_max_tokens}, "
+            f"reasoning_effort={openai_reasoning_effort}, function_calling={openai_is_fc_model})"
         )
 
-        llm = OpenAILike(
+        llm = OpenAICompatibleLLM(
             model=openai_chat_model,
             api_base=openai_chat_url,
             api_key=openai_chat_key,
+            temperature=openai_chat_temperature,
+            max_tokens=openai_chat_max_tokens,
+            reasoning_effort=openai_reasoning_effort,
+            timeout=120.0,
+            is_function_calling_model=openai_is_fc_model,
         )
 
         return llm
diff --git a/services/rag/llamaindex/helpers/openai_compatible_llm.py b/services/rag/llamaindex/helpers/openai_compatible_llm.py
index a8778cf..1cad2c9 100644
--- a/services/rag/llamaindex/helpers/openai_compatible_llm.py
+++ b/services/rag/llamaindex/helpers/openai_compatible_llm.py
@@ -21,15 +21,20 @@ class OpenAICompatibleLLM(OpenAILike):
         api_key: str,
         temperature: float = 0.1,
         timeout: float = 120.0,
+        max_tokens: int | None = None,
+        reasoning_effort: str | None = None,
         is_function_calling_model: bool = False,
     ):
         super().__init__(
             model=model,
             api_base=api_base,
             api_key=api_key,
             temperature=temperature,
             timeout=timeout,
+            max_tokens=max_tokens,
+            reasoning_effort=reasoning_effort,
             # Explicitly avoid "registered model only" assumptions.
             is_chat_model=True,
             is_function_calling_model=is_function_calling_model,
+            should_use_structured_outputs=False,
         )