verl
verl copied to clipboard
ray.exceptions.RayTaskError(CompilationError):
Thanks for the lib! When I run the script verl/examples/grpo_trainer/run_qwen3-8b.sh, a RayTaskError occurs. I have checked that the Ray version is (2.43.0), triton(3.1.0),vllm(0.8.5),verl(0.4.0),but this error is not sporadic—it happens every time. Could you please tell me what might be causing this issue?
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.12/site-packages/verl/trainer/main_ppo.py", line 222, in <module>
main()
File "/usr/local/lib/python3.12/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
^^^^^^
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.12/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/trainer/main_ppo.py", line 27, in main
run_ppo(config)
File "/usr/local/lib/python3.12/site-packages/verl/trainer/main_ppo.py", line 39, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/ray/_private/worker.py", line 2771, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/ray/_private/worker.py", line 919, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(CompilationError): ray::TaskRunner.run() (pid=842074, ip=10.193.80.54, actor_id=904189ba5169669a015aca0701000000, repr=<main_ppo.TaskRunner object at 0x7f166725a9f0>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/trainer/main_ppo.py", line 159, in run
trainer.fit()
File "/usr/local/lib/python3.12/site-packages/verl/trainer/ppo/ray_trainer.py", line 971, in fit
val_metrics = self._validate()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/trainer/ppo/ray_trainer.py", line 701, in _validate
test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/single_controller/ray/base.py", line 51, in __call__
output = ray.get(output)
^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayTaskError(CompilationError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=850420, ip=10.193.80.54, actor_id=774961469d6b6f473a8d43d501000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7eda9424a720>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/single_controller/ray/base.py", line 645, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/single_controller/base/decorator.py", line 540, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/workers/fsdp_workers.py", line 627, in generate_sequences
output = self.rollout.generate_sequences(prompts=prompts)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/utils/debug/performance.py", line 75, in f
return self.log(decorated_function, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/utils/debug/performance.py", line 85, in log
output = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py", line 288, in generate_sequences
outputs = self.inference_engine.generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/utils.py", line 1196, in inner
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/entrypoints/llm.py", line 473, in generate
outputs = self._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/entrypoints/llm.py", line 1423, in _run_engine
step_outputs = self.llm_engine.step()
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/engine/llm_engine.py", line 1412, in step
outputs = self.model_executor.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 140, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/utils.py", line 2456, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 420, in execute_model
output = self.model_runner.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/worker/model_runner.py", line 1764, in execute_model
hidden_or_intermediate_states = model_executable(
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen3.py", line 299, in forward
hidden_states = self.model(input_ids, positions, intermediate_tensors,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/compilation/decorators.py", line 172, in __call__
return self.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen2.py", line 343, in forward
hidden_states, residual = layer(
^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen3.py", line 212, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen3.py", line 143, in forward
attn_output = self.attn(q, k, v)
^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/attention/layer.py", line 233, in forward
return torch.ops.vllm.unified_attention(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/_ops.py", line 1123, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/attention/layer.py", line 379, in unified_attention
output = self.impl.forward(self, query, key, value, kv_cache,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/attention/backends/xformers.py", line 578, in forward
out = PagedAttention.forward_prefix(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/attention/ops/paged_attn.py", line 213, in forward_prefix
context_attention_fwd(
File "/usr/local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/vllm/attention/ops/prefix_prefill.py", line 850, in context_attention_fwd
_fwd_kernel[grid](
File "/usr/local/lib/python3.12/site-packages/triton/runtime/jit.py", line 345, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/triton/runtime/jit.py", line 662, in run
kernel = self.compile(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/triton/compiler/compiler.py", line 276, in compile
module = src.make_ir(options, codegen_fns, context)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/triton/compiler/compiler.py", line 113, in make_ir
return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
triton.compiler.errors.CompilationError: at 99:4:
q = tl.load(Q + off_q,
mask=dim_mask[None, :] &
(offs_m[:, None] < cur_batch_query_len),
other=0.0) # [M,D]
# initialize pointer to m and l
m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) # [M,D]
# compute query against context (no causal mask here)
for start_n in tl.range(0, cur_batch_ctx_len, BLOCK_SIZE, \
^
TypeError("range.__init__() got an unexpected keyword argument 'loop_unroll_factor'")
(WorkerDict pid=850422) kwargs: {'n': 4, 'logprobs': 0, 'max_tokens': 1024, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 7x across cluster]
(WorkerDict pid=850415) WARNING 07-08 09:58:43 [prefix_caching_block.py:456] Failed to reset prefix cache because some blocks (232) are not freed yet [repeated 7x across cluster]
(WorkerDict pid=850422) /usr/local/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html . [repeated 7x across cluster]
(WorkerDict pid=850422) warnings.warn( [repeated 7x across cluster]
verl/examples/grpo_trainer/run_qwen3-8b.sh:
export HYDRA_FULL_ERROR=1
export RAY_IGNORE_UNHANDLED_ERRORS=1
HF_MODEL_PATH=/model/Qwen3-4B
export CUDA_VISIBLE_DEVICES=8,9,10,11,12,13,14,15
export VLLM_ATTENTION_BACKEND=XFORMERS
set -x
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/mnt/workspace/code/verl/gsm8k/train.parquet \
data.val_files=/mnt/workspace/code/verl/gsm8k/test.parquet \
data.train_batch_size=8 \
data.max_prompt_length=256 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$HF_MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=4 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
actor_rollout_ref.rollout.n=4 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console','tensorboard'] \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen3_8b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
Issue Severity
High: It blocks me from completing my task.
met the same problem, do you have any idea about how to fix it?
Encountering the same issue
any progress?