From 5721bad117103fc2e5a47ddebba40f3ae488dc67 Mon Sep 17 00:00:00 2001
From: idchlife
Date: Mon, 9 Mar 2026 10:21:39 +0300
Subject: [PATCH] fix for openai compatibility model with needed parameters

---
 services/rag/llamaindex/.env.dist      |  3 +++
 services/rag/llamaindex/chat_engine.py | 14 ++++++++++++++
 services/rag/llamaindex/config.py      | 21 +++++++++++++++----
 .../helpers/openai_compatible_llm.py   |  5 +++++
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/services/rag/llamaindex/.env.dist b/services/rag/llamaindex/.env.dist
index d6a3dd5..657f4e8 100644
--- a/services/rag/llamaindex/.env.dist
+++ b/services/rag/llamaindex/.env.dist
@@ -14,6 +14,9 @@ QDRANT_GRPC_PORT=6334
 # OpenAI Configuration (for reference - uncomment and configure when using OpenAI strategy)
 # OPENAI_CHAT_URL=https://api.openai.com/v1
 # OPENAI_CHAT_KEY=your_openai_api_key_here
+# OPENAI_CHAT_TEMPERATURE=0.1
+# OPENAI_CHAT_MAX_TOKENS=1024
+# OPENAI_CHAT_REASONING_EFFORT=low
 # OPENAI_CHAT_IS_FUNCTION_CALLING_MODEL=false
 # OPENAI_EMBEDDING_MODEL=text-embedding-3-small
 # OPENAI_EMBEDDING_BASE_URL=https://api.openai.com/v1
diff --git a/services/rag/llamaindex/chat_engine.py b/services/rag/llamaindex/chat_engine.py
index 162ad70..b6c879b 100644
--- a/services/rag/llamaindex/chat_engine.py
+++ b/services/rag/llamaindex/chat_engine.py
@@ -17,6 +17,7 @@
 from typing import Any, Iterable, List
 from llama_index.core import PromptTemplate
 from llama_index.core.agent import AgentWorkflow
+from llama_index.core.base.llms.types import ChatMessage, MessageRole
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.schema import NodeWithScore
 from llama_index.core.tools import FunctionTool
@@ -230,6 +231,19 @@ def synthesize_answer(query: str, sources: list[dict[str, Any]], agent_draft: st
         context_json=context_json,
     )
     logger.info("Synthesizing grounded answer from retrieved sources")
+    # Prefer chat API for chat-capable models; fall back to completion if unavailable.
+    try:
+        if hasattr(llm, "chat"):
+            chat_response = llm.chat(
+                [
+                    ChatMessage(role=MessageRole.SYSTEM, content="You answer with grounded citations only."),
+                    ChatMessage(role=MessageRole.USER, content=prompt),
+                ]
+            )
+            return _normalize_text(getattr(chat_response, "message", chat_response).content)
+    except Exception as e:
+        logger.warning(f"LLM chat synthesis failed, falling back to completion: {e}")
+
     response = llm.complete(prompt)
     return _normalize_text(getattr(response, "text", response))
 
diff --git a/services/rag/llamaindex/config.py b/services/rag/llamaindex/config.py
index e948e90..b2a88ba 100644
--- a/services/rag/llamaindex/config.py
+++ b/services/rag/llamaindex/config.py
@@ -93,12 +93,19 @@ def get_llm_model():
         return llm
 
     elif strategy == "openai":
-        from llama_index.llms.openai_like import OpenAILike
-        # from helpers.openai_compatible_llm import OpenAICompatibleLLM
+        from helpers.openai_compatible_llm import OpenAICompatibleLLM
 
         openai_chat_url = os.getenv("OPENAI_CHAT_URL", "https://api.openai.com/v1")
         openai_chat_key = os.getenv("OPENAI_CHAT_KEY", "dummy_key_for_template")
         openai_chat_model = os.getenv("OPENAI_CHAT_MODEL", "gpt-3.5-turbo")
+        openai_chat_temperature = float(os.getenv("OPENAI_CHAT_TEMPERATURE", "0.1"))
+        openai_chat_max_tokens_env = os.getenv("OPENAI_CHAT_MAX_TOKENS", "").strip()
+        openai_chat_max_tokens = (
+            int(openai_chat_max_tokens_env) if openai_chat_max_tokens_env else 1024
+        )
+        openai_reasoning_effort = (
+            os.getenv("OPENAI_CHAT_REASONING_EFFORT", "").strip() or None
+        )
         openai_is_fc_model = (
             os.getenv("OPENAI_CHAT_IS_FUNCTION_CALLING_MODEL", "false").lower()
             == "true"
@@ -109,13 +116,19 @@
 
         logger.info(
             f"Initializing OpenAI-compatible chat model: {openai_chat_model} "
-            f"(base={openai_chat_url}, function_calling={openai_is_fc_model})"
+            f"(base={openai_chat_url}, max_tokens={openai_chat_max_tokens}, "
+            f"reasoning_effort={openai_reasoning_effort}, function_calling={openai_is_fc_model})"
         )
 
-        llm = OpenAILike(
+        llm = OpenAICompatibleLLM(
             model=openai_chat_model,
             api_base=openai_chat_url,
             api_key=openai_chat_key,
+            temperature=openai_chat_temperature,
+            max_tokens=openai_chat_max_tokens,
+            reasoning_effort=openai_reasoning_effort,
+            timeout=120.0,
+            is_function_calling_model=openai_is_fc_model,
         )
 
         return llm
diff --git a/services/rag/llamaindex/helpers/openai_compatible_llm.py b/services/rag/llamaindex/helpers/openai_compatible_llm.py
index a8778cf..1cad2c9 100644
--- a/services/rag/llamaindex/helpers/openai_compatible_llm.py
+++ b/services/rag/llamaindex/helpers/openai_compatible_llm.py
@@ -21,15 +21,20 @@ class OpenAICompatibleLLM(OpenAILike):
         api_key: str,
         temperature: float = 0.1,
         timeout: float = 120.0,
+        max_tokens: int | None = None,
+        reasoning_effort: str | None = None,
         is_function_calling_model: bool = False,
     ):
         super().__init__(
             model=model,
             api_base=api_base,
             api_key=api_key,
             temperature=temperature,
             timeout=timeout,
+            max_tokens=max_tokens,
+            reasoning_effort=reasoning_effort,
             # Explicitly avoid "registered model only" assumptions.
             is_chat_model=True,
             is_function_calling_model=is_function_calling_model,
+            should_use_structured_outputs=False,
         )