[Bug] SimpleQA测评报错 LMEvaluator.__init__() 缺失参数 'judge_cfg' 和 'output_path'
Prerequisite
- [x] I have searched Issues and Discussions but cannot get the expected help.
- [x] The bug has not been fixed in the latest version.
Type
I'm evaluating with the officially supported tasks/models/datasets.
Environment
{'CUDA available': True, 'CUDA_HOME': '/usr/local/cuda-12.4', 'GCC': 'gcc (GCC) 12.2.0', 'GPU 0,1,2,3,4,5,6,7': 'NVIDIA A800-SXM4-80GB', 'MMEngine': '0.10.7', 'MUSA available': False, 'NVCC': 'Cuda compilation tools, release 12.4, V12.4.131', 'OpenCV': '4.11.0', 'PyTorch': '2.7.1+cu126', 'PyTorch compiling details': 'PyTorch built with:\n' ' - GCC 11.2\n' ' - C++ Version: 201703\n' ' - Intel(R) oneAPI Math Kernel Library Version ' '2024.2-Product Build 20240605 for Intel(R) 64 ' 'architecture applications\n' ' - Intel(R) MKL-DNN v3.7.1 (Git Hash ' '8d263e693366ef8db40acc569cc7d8edf644556d)\n' ' - OpenMP 201511 (a.k.a. OpenMP 4.5)\n' ' - LAPACK is enabled (usually provided by ' 'MKL)\n' ' - NNPACK is enabled\n' ' - CPU capability usage: AVX512\n' ' - CUDA Runtime 12.6\n' ' - NVCC architecture flags: ' '-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n' ' - CuDNN 90.0 (built against CUDA 12.3)\n' ' - Built with CuDNN 90.5.1\n' ' - Magma 2.6.1\n' ' - Build settings: BLAS_INFO=mkl, ' 'BUILD_TYPE=Release, ' 'COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, ' 'CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, ' 'CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, ' 'CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 ' '-fvisibility-inlines-hidden -DUSE_PTHREADPOOL ' '-DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER ' '-DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM ' '-DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK ' '-DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC ' '-Wall -Wextra -Werror=return-type ' '-Werror=non-virtual-dtor ' '-Werror=range-loop-construct ' '-Werror=bool-operation -Wnarrowing ' '-Wno-missing-field-initializers ' '-Wno-unknown-pragmas -Wno-unused-parameter ' '-Wno-strict-overflow -Wno-strict-aliasing ' '-Wno-stringop-overflow -Wsuggest-override ' '-Wno-psabi -Wno-error=old-style-cast ' 
'-fdiagnostics-color=always -faligned-new ' '-Wno-maybe-uninitialized -fno-math-errno ' '-fno-trapping-math -Werror=format ' '-Wno-stringop-overflow, LAPACK_INFO=mkl, ' 'PERF_WITH_AVX=1, PERF_WITH_AVX2=1, ' 'TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, ' 'USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, ' 'USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, ' 'USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, ' 'USE_OPENMP=ON, USE_ROCM=OFF, ' 'USE_ROCM_KERNEL_ASSERT=OFF, \n', 'Python': '3.10.18 (main, Jun 5 2025, 13:14:17) [GCC 11.2.0]', 'lmdeploy': "not installed:No module named 'lmdeploy'", 'numpy_random_seed': 2147483648, 'opencompass': '0.4.2+', 'sys.platform': 'linux', 'transformers': '4.52.4'}
Reproduces the problem - code/configuration sample
from mmengine.config import read_base
from opencompass.models import OpenAISDK
from opencompass.partitioners import SizePartitioner, NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.datasets import GaoKaoMATHEvaluator
from opencompass.registry import MODELS

with read_base():
    from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets
    from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
    from opencompass.configs.datasets.civilcomments.civilcomments_clp_a3c5fd import civilcomments_datasets
    from opencompass.configs.datasets.cvalues.cvalues_responsibility_gen_543378 import cvalues_datasets
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets
    from opencompass.configs.datasets.mbpp.mbpp_gen_830460 import mbpp_datasets
    from opencompass.configs.datasets.gaokao_math.gaokao_math_gen_f5fd28 import gaokao_math_datasets
    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
    from opencompass.configs.datasets.mmmlu.mmmlu_gen_c51a84 import mmmlu_datasets
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import simpleqa_datasets
    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets

# ---- Global settings ----
MODEL_NAME = 'deepseek-ai/DeepSeek-V3-0324'  # model name sent in requests
API_BASE = ...  # redacted in the report
API_KEY = 'EMPTY'
JUDGE_MODEL_NAME = 'deepseek-chat'  # judge model name sent in requests
JUDGE_API_BASE = 'https://api.deepseek.com/v1'
JUDGE_API_KEY = ...  # redacted in the report

# ---- Dataset selection ----
datasets = [
    drop_datasets,
    cluewsc_datasets,
    humaneval_datasets,
    mbpp_datasets,
    cvalues_datasets,
    gaokao_math_datasets,
    mmlu_datasets,
    mmmlu_datasets,
    gpqa_datasets,
    simpleqa_datasets,
    math_datasets,
    gsm8k_datasets,
]

# ---- Inference model configuration ----
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=5000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLEvalTask)),
)

models = [
    dict(
        type=OpenAISDK,
        key=API_KEY,
        openai_api_base=API_BASE,  # service endpoint
        path=MODEL_NAME,           # model name used when calling the service
        tokenizer_path=None,
        rpm_verbose=True,          # whether to log the request rate
        meta_template=api_meta_template,  # request meta template
        query_per_second=16,       # request rate
        max_out_len=8192,          # max output length
        max_seq_len=8192,          # max input length
        temperature=0.6,           # sampling temperature
        batch_size=8,              # batch size
        extra_body={'chat_template_kwargs': {'enable_thinking': False}},
        retry=3,                   # number of retries
    )
]

# NOTE(review): this rebinding discards the full dataset list above and keeps
# only SimpleQA — presumably to narrow the repro to the failing dataset.
datasets = simpleqa_datasets
Reproduces the problem - command or script
opencompass run_opencompass_test.py -w ./outputs/tests
Reproduces the problem - error message
Other information
# Adding the snippet below resolves the missing-argument error, but a new error follows.
verifier_cfg = dict(
    abbr=JUDGE_MODEL_NAME,
    type=OpenAISDK,
    path=JUDGE_MODEL_NAME,             # replace with the actual model path
    key=JUDGE_API_KEY,                 # replace with a real API key
    openai_api_base=[JUDGE_API_BASE],  # replace with the actual API endpoint
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    max_out_len=16384,
)

# Inject the judge config and output path into the SimpleQA evaluator entry.
for item in datasets:
    if item[0]["abbr"] == "simpleqa":
        item[0]['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
        item[0]['eval_cfg']['evaluator']['output_path'] = './outputs/tests/'
新报错:
请问这个问题后来解决了吗
遇到了同样的问题
遇到了同样的问题
试试参照这个 issue:#1590。不过我用的是旧版本的 OpenCompass,我测试仍然是可以执行的。
# OpenCompass config: subjective FLAMES evaluation of a local HF chat model,
# judged by an OpenAI-compatible service, summarized with FlamesSummarizer.
from mmengine.config import read_base
with read_base():
    # from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import models
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import models as lmdeploy_qwen2_5_1_5b_instruct
    # from opencompass.configs.models.qwen2_5.vllm_qwen2_5_1_5b_instruct import models as vllm_qwen2_5_1_5b_instruct
    # from opencompass.configs.models.qwen2_5.hf_qwen2_5_1_5b_instruct import models as hf_qwen2_5_1_5b_instruct
    from opencompass.configs.datasets.subjective.flames.flames_gen import flames_datasets
from opencompass.models.huggingface_above_v4_33 import HuggingFacewithChatTemplate  # do not place inside read_base: there is a bug
from opencompass.models import OpenAISDK
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import FlamesSummarizer
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*flames_datasets]
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2.5-1.5b-instruct-hf',
        path='/space/llms/Qwen2.5-1.5B-Instruct',
        generation_kwargs=dict(
            do_sample=True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
        ),
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
judge_models = [
    dict(
        abbr='Qwen3-8B',
        type=OpenAISDK,
        key='EMPTY', # API key
        openai_api_base='http://ip:port/v1/', # service endpoint
        path='Qwen3-8B', # model name used when calling the service
        tokenizer_path='/space/llms/Qwen3-8B/', # tokenizer name or path for requests; None falls back to the default gpt-4 tokenizer
        rpm_verbose=True, # whether to log the request rate
        meta_template=api_meta_template, # request meta template
        query_per_second=1, # request rate
        max_out_len=10240, # max output length
        max_seq_len=15000, # max input length
        temperature=0.2, # sampling temperature
        batch_size=8, # batch size
        retry=3, # number of retries
    )
]
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=FlamesSummarizer)
work_dir = 'outputs/demo/'
接上条,目前两种评估方式
# Config-file invocation: runs the subjective pipeline, triggering
# subjective_eval.py _score(), which does the judgement post-processing.
python run.py configs/eval_flame_demo.py \
-w outputs/demo \
--debug
会触发 opencompass/tasks/subjective_eval.py 中的 _score() 方法,这里面后处理得到issue报错里的 'judgements',所以流程是成功的
另一种评估方式,貌似在被弃用,即
# CLI-only invocation (reported as being deprecated): routes to
# openicl_eval.py _score(), which lacks that post-processing and errors.
python run.py \
--hf-type base --hf-path /space/llms/Qwen2-1.5B-Instruct \
--datasets flames_gen \
-w outputs/demo \
--debug
会触发 opencompass/tasks/openicl_eval.py 中的 _score() 方法,这里面没有做后处理,所以报错了,至于调整下代码后是否能直接使用,还在调试中……