sglang_multiturn example fails
System Info
----------Python Info----------
Version : 3.10.12
Compiler : GCC 11.4.0
Build : ('main', 'Jul 29 2024 16:56:48')
Arch : ('64bit', 'ELF')
------------Pip Info-----------
Version : 25.2
Directory : /usr/local/lib/python3.10/dist-packages/pip
vllm : not found.
sglang : 0.4.9.post6
ray : 2.47.1
torch : 2.7.1
----------verl Info-----------
Version : 0.7.0.dev
Directory : /root/verl/verl
Commit Hash : e3b77a6e193b3bd31429910e03c7788fea3db353
----------Platform Info----------
Platform : Linux-6.5.0-28-generic-x86_64-with-glibc2.35
system : Linux
node : b78b4b783b72
release : 6.5.0-28-generic
version : #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr 4 14:39:20 UTC 2
----------Environment----------
CUDA Runtime : 12.6
CUDA compiler : Not found: [Errno 2] No such file or directory: 'nvcc'
----------System Info----------
CPU Memory : 1007.67 GB
GPU Count : 2
GPU 1 Type : NVIDIA A100-SXM4-80GB
GPU 1 Memory : 80.00 GB
GPU 2 Type : NVIDIA A100-SXM4-80GB
GPU 2 Memory : 80.00 GB
Information
- [x] The official example scripts
- [ ] My own modified scripts
Tasks
- [x] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- [ ] My own task or dataset (give details below)
Reproduction
Change trainer.n_gpus_per_node=8 \ to trainer.n_gpus_per_node=2 \ in the script, then run:
bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh
Expected behavior
The example verl/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh is expected to run end to end, but it fails with the following error:
Traceback (most recent call last):
File "/root/verl/verl/trainer/main_ppo.py", line 42, in main
run_ppo(config)
File "/root/verl/verl/trainer/main_ppo.py", line 96, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2849, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 937, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TaskRunner.run() (pid=39507, ip=10.128.7.2, actor_id=e80d9589afc763d57e8ca1ea01000000, repr=<main_ppo.TaskRunner object at 0x7d38bc0e2b90>)
File "/root/verl/verl/trainer/main_ppo.py", line 341, in run
trainer.fit()
File "/root/verl/verl/trainer/ppo/ray_trainer.py", line 1166, in fit
old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
File "/root/verl/verl/single_controller/ray/base.py", line 48, in __call__
output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_compute_log_prob() (pid=44170, ip=10.128.7.2, actor_id=b34e74469eec3689e7e7f72301000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7bde5af43a00>)
File "/root/verl/verl/single_controller/ray/base.py", line 700, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/root/verl/verl/single_controller/base/decorator.py", line 442, in inner
return func(*args, **kwargs)
File "/root/verl/verl/utils/transferqueue_utils.py", line 199, in dummy_inner
return func(*args, **kwargs)
File "/root/verl/verl/utils/profiler/profile.py", line 256, in wrapper
return func(self_instance, *args, **kwargs_inner)
File "/root/verl/verl/workers/fsdp_workers.py", line 976, in compute_log_prob
output, entropys = self.actor.compute_log_prob(data=data, calculate_entropy=True)
File "/root/verl/verl/utils/profiler/performance.py", line 105, in f
return self.log(decorated_function, *args, **kwargs)
File "/root/verl/verl/utils/profiler/performance.py", line 118, in log
output = func(*args, **kwargs)
File "/root/verl/verl/workers/actor/dp_actor.py", line 339, in compute_log_prob
entropy, log_probs = self._forward_micro_batch(
File "/root/verl/verl/workers/actor/dp_actor.py", line 170, in _forward_micro_batch
output = self.actor_module(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 856, in forward
output = self._fsdp_wrapped_module(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 943, in wrapper
output = func(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py", line 544, in forward
outputs: BaseModelOutputWithPast = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 943, in wrapper
output = func(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py", line 432, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 856, in forward
output = self._fsdp_wrapped_module(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_layers.py", line 83, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py", line 236, in forward
hidden_states, self_attn_weights = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py", line 154, in forward
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 128] because the unspecified dimension size -1 can be any value and is ambiguous
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
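For reference, the same RuntimeError can be reproduced in isolation: PyTorch raises it whenever a tensor with zero elements is viewed with an inferred (-1) dimension, which is what happens in the q_proj call above when the effective sequence length of a micro-batch is zero. A minimal sketch, assuming Qwen2.5-3B's hidden_size of 2048 and head_dim of 128; the shapes are illustrative, not taken from the failing batch:

```python
import torch

# Simulate the hidden states entering q_proj when a micro-batch contains
# zero valid tokens (the sequence dimension collapses to 0).
hidden_size, head_dim = 2048, 128
hidden_states = torch.zeros(1, 0, hidden_size)

q_proj = torch.nn.Linear(hidden_size, hidden_size)
query_states = q_proj(hidden_states)  # shape (1, 0, 2048), 0 elements

# Same pattern as modeling_qwen2.py: view(*input_shape, -1, head_dim)
query_states.view(1, 0, -1, head_dim)
# RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 128]
# because the unspecified dimension size -1 can be any value and is ambiguous
```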
I checked and found that some generated sequences have an attention_mask that is all zeros, which should not happen.
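One way to confirm this before the log-prob pass is to scan the rollout batch for rows whose attention_mask sums to zero. A minimal sketch, not verl's API; the batch["attention_mask"] key and the (batch_size, seq_len) layout are assumptions about how the rollout output is stored:

```python
import torch

def find_empty_sequences(attention_mask: torch.Tensor) -> list[int]:
    """Return indices of samples whose attention_mask is entirely zero.

    Such samples contribute zero valid tokens, so the attention projection
    sees an empty sequence dimension and the view above fails.
    """
    valid_tokens = attention_mask.sum(dim=-1)  # (batch_size,)
    return torch.nonzero(valid_tokens == 0).flatten().tolist()

# Hypothetical usage right before compute_log_prob, assuming the batch
# exposes an "attention_mask" tensor of shape (batch_size, seq_len):
# bad = find_empty_sequences(batch["attention_mask"])
# if bad:
#     print(f"{len(bad)} sequences have an all-zero attention_mask: {bad}")
```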
Have you solved it?