AspectCritic not working with OpenAI o3
- [ ] I have checked the documentation and related resources and couldn't resolve my bug.
**Describe the bug**
I am using the AspectCritic metric with custom settings for the evaluator model. When switching from gpt-4o to o3, I get a runtime error.

Ragas version: 0.2.13
Python version: 3.11.2
**Code to Reproduce**

```python
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

prompt = "How much is 2+2?"

##########################################################
# This openai_llm works correctly as evaluator_llm:
openai_llm = ChatOpenAI(model="gpt-4o", temperature=1, max_tokens=10000)

# This openai_llm does NOT work as evaluator_llm:
openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)
##########################################################

response = openai_llm.invoke(prompt)
print(response.content)

evaluator_llm = LangchainLLMWrapper(openai_llm)

critic = AspectCritic(
    name="math_accuracy",
    definition="Is the mathematical operation done correctly?",
    llm=evaluator_llm,
    strictness=3,
)

sample = SingleTurnSample(
    user_input=prompt,
    response=response.content,
    retrieved_contexts=[],
)

aspect_critic_score = critic.single_turn_score(sample)
print(aspect_critic_score)
```
**Error trace**

```
BadRequestError                           Traceback (most recent call last)
Cell In[7], line 28
     15 critic = AspectCritic(
    (...)
     26 )
---> 28 aspect_critic_score = critic.single_turn_score(sample)
     29 print(aspect_critic_score)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:497, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
    495 if not group_cm.ended:
    496     rm.on_chain_error(e)
--> 497 raise e

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:491, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
    490 loop = asyncio.get_event_loop()
--> 491 score = loop.run_until_complete(
    492     self._single_turn_ascore(sample=sample, callbacks=group_cm)
    493 )

File ~/env-nlp/lib/python3.11/site-packages/nest_asyncio.py:98, in _patch_loop.

File /usr/lib/python3.11/asyncio/futures.py:203, in Future.result(self)
--> 203 raise self._exception.with_traceback(self._exception_tb)

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
--> 267 result = coro.send(None)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:171, in AspectCritic._single_turn_ascore(self, sample, callbacks)
    170 row = sample.to_dict()
--> 171 return await self._ascore(row, callbacks)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:190, in AspectCritic._ascore(self, row, callbacks)
--> 190 response = await self.single_turn_prompt.generate(
    191     data=prompt_input,
    192     llm=self.llm,
    193     callbacks=callbacks,
    194 )

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:129, in PydanticPrompt.generate(self, llm, data, temperature, stop, callbacks, retries_left)
--> 129 output_single = await self.generate_multiple(
    130     llm=llm,
    131     data=data,
    132     n=1,
    133     temperature=temperature,
    134     stop=stop,
    135     callbacks=callbacks,
    136     retries_left=retries_left,
    137 )

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:190, in PydanticPrompt.generate_multiple(self, llm, data, n, temperature, stop, callbacks, retries_left)
    189 prompt_value = PromptValue(text=self.to_string(processed_data))
--> 190 resp = await llm.generate(
    191     prompt_value,
    192     n=n,
    193     temperature=temperature,
    194     stop=stop,
    195     callbacks=prompt_cb,
    196 )

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:108, in BaseRagasLLM.generate(self, prompt, n, temperature, stop, callbacks)
    103 temperature = self.get_temperature(n)
--> 108 result = await agenerate_text_with_retry(
    109     prompt=prompt,
    110     n=n,
    111     temperature=temperature,
    112     stop=stop,
    113     callbacks=callbacks,
    114 )

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:189, in AsyncRetrying.wraps.

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:111, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
--> 111 do = await self.iter(retry_state=retry_state)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:153, in AsyncRetrying.iter(self, retry_state)
--> 153 result = await action(retry_state)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/_utils.py:99, in wrap_to_async_func.

File ~/env-nlp/lib/python3.11/site-packages/tenacity/__init__.py:398, in BaseRetrying._post_retry_check_actions.

File /usr/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
--> 449 return self.__get_result()

File /usr/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
--> 401 raise self._exception

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:114, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
--> 114 result = await fn(*args, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:253, in LangchainLLMWrapper.agenerate_text(self, prompt, n, temperature, stop, callbacks)
--> 253 result = await self.langchain_llm.agenerate_prompt(
    254     prompts=[prompt],
    255     stop=stop,
    256     callbacks=callbacks,
    257 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:905, in BaseChatModel.agenerate_prompt(self, prompts, stop, callbacks, **kwargs)
    904 prompt_messages = [p.to_messages() for p in prompts]
--> 905 return await self.agenerate(
    906     prompt_messages, stop=stop, callbacks=callbacks, **kwargs
    907 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:863, in BaseChatModel.agenerate(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
--> 863 raise exceptions[0]

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
--> 267 result = coro.send(None)

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1033, in BaseChatModel._agenerate_with_cache(self, messages, stop, run_manager, **kwargs)
-> 1033 result = await self._agenerate(
   1034     messages, stop=stop, run_manager=run_manager, **kwargs
   1035 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_openai/chat_models/base.py:960, in BaseChatOpenAI._agenerate(self, messages, stop, run_manager, **kwargs)
--> 960 response = await self.async_client.create(**payload)

File ~/env-nlp/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:2028, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)
-> 2028 return await self._post(
   2029     "/chat/completions",
    (...)
   2074 )

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1742, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls)
-> 1742 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1549, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls)
   1548 log.debug("Re-raising status error")
-> 1549 raise self._make_status_error_from_response(err.response) from None

BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'temperature' does not support 1E-8 with this model. Only the default (1) value is supported.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_value'}}
```
**Expected behavior**
I have explicitly set the temperature to 1, which is the only value o3 accepts:

```python
openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)
```

However, Ragas internally sets the model temperature to 1e-8, ignoring my setting, which causes the runtime error.
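For reference, the 1e-8 comes from Ragas itself: `BaseRagasLLM.generate` calls `self.get_temperature(n)` (visible at `ragas/llms/base.py:103` in the trace above) and passes the result down to the wrapped LangChain model, overriding whatever temperature `ChatOpenAI` was constructed with. A rough sketch of that logic, not a verbatim copy of the library source (the exact constants may differ between versions):

```python
# Rough sketch of the ragas 0.2.x behavior, not verbatim library code.
class BaseRagasLLM:
    def get_temperature(self, n: int) -> float:
        # A near-zero temperature is chosen for single completions,
        # regardless of the temperature set on the wrapped ChatOpenAI.
        # o-series models reject anything other than the default 1.
        return 0.3 if n > 1 else 1e-8
```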
same here. I'm using this workaround:

```python
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

llm = ChatOpenAI(model_name="o4-mini", temperature=1)

class ReasoningOpenAILLM(LangchainLLMWrapper):
    def get_temperature(self, n: int) -> float:
        # Force temperature to 1, the only value reasoning models accept,
        # instead of the 1e-8 Ragas would set for a single completion.
        return 1.0

llm = ReasoningOpenAILLM(llm)
```
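In case it helps, this is how the subclass slots into the repro from the issue; the `response` string below is just a stand-in answer:

```python
from langchain_openai import ChatOpenAI
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

# Wrap the o-series model with the subclass so get_temperature returns 1.
evaluator_llm = ReasoningOpenAILLM(ChatOpenAI(model="o3", temperature=1))

critic = AspectCritic(
    name="math_accuracy",
    definition="Is the mathematical operation done correctly?",
    llm=evaluator_llm,
    strictness=3,
)

sample = SingleTurnSample(
    user_input="How much is 2+2?",
    response="2 + 2 = 4",  # stand-in answer
    retrieved_contexts=[],
)

# With temperature forced to 1, the 400 error goes away.
print(critic.single_turn_score(sample))
```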
Hi! Is there anything else to this workaround? I tried implementing it here, but the error persisted.
Thanks, that worked for me with the gpt-5 model.