verl
verl copied to clipboard
vllm version error about Ascend NPU
System Info
I tried to run the vLLM+NPU training script: examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh
# run on Ascend 910
# make sure your current working directory is the root of the project

# Echo each command before executing it, for easier debugging of the launch.
set -x

# Raise the open-file-descriptor limit: Ray + vLLM open many sockets/files.
ulimit -n 65535

# set vllm v1 env (force the vLLM V1 engine)
export VLLM_USE_V1=1

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"

# Global train batch size and per-device micro batch size, reused below so
# the actor/ref/rollout settings stay consistent.
TRAIN_BATCH_SIZE=32
MICRO_BATCH_SIZE=8

# Launch GRPO training via Hydra overrides. Every line must end in " \"
# (space before the backslash) — a backslash glued to the value would
# concatenate the next override onto this one and corrupt both settings.
python3 -m verl.trainer.main_ppo \
    --config-path="$CONFIG_PATH" \
    --config-name='gsm8k_multiturn_grpo' \
    actor_rollout_ref.rollout.name=vllm \
    algorithm.adv_estimator=grpo \
    data.train_batch_size=${TRAIN_BATCH_SIZE} \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path="/home/Qwen2.5-3B-Instruct" \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=${TRAIN_BATCH_SIZE} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.project_name='gsm8k_async_rl' \
    trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \
    trainer.device=npu \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=20 \
    trainer.logger='["console"]' \
    data.train_files=/home/data/gsm8k/train.parquet \
    data.val_files=/home/data/gsm8k/test.parquet \
    trainer.total_epochs=15 \
    actor_rollout_ref.rollout.update_weights_bucket_megabytes=512 \
    actor_rollout_ref.rollout.trace.token2text=False \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.multi_turn.enable=true \
    actor_rollout_ref.rollout.enforce_eager=True \
    actor_rollout_ref.actor.use_torch_compile=False \
    actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
    actor_rollout_ref.rollout.free_cache_engine=True
Information
- [x] The official example scripts
- [ ] My own modified scripts
Tasks
- [x] An officially supported task in the
`examples` folder (such as GLUE/SQuAD, ...)
- [ ] My own task or dataset (give details below)
Reproduction
The error information is:
Traceback (most recent call last):
File "/verl/verl/trainer/main_ppo.py", line 42, in main
run_ppo(config)
File "/verl/verl/trainer/main_ppo.py", line 96, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/python3.11.13/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/python3.11.13/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/python3.11.13/lib/python3.11/site-packages/ray/_private/worker.py", line 2822, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/python3.11.13/lib/python3.11/site-packages/ray/_private/worker.py", line 930, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): ray::TaskRunner.run() (pid=312910, ip=10.1.68.108, actor_id=1d2b384df857f2b3e9ff165001000000, repr=<main_ppo.TaskRunner object at 0xfffc01112cd0>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/verl/verl/trainer/main_ppo.py", line 339, in run
trainer.init_workers()
File "/verl/verl/trainer/ppo/ray_trainer.py", line 774, in init_workers
self.async_rollout_manager = AgentLoopManager(
^^^^^^^^^^^^^^^^^
File "/verl/verl/experimental/agent_loop/agent_loop.py", line 688, in __init__
self.sleep()
File "/verl/verl/experimental/agent_loop/agent_loop.py", line 801, in sleep
self._run_all([replica.sleep() for replica in self.rollout_replicas])
File "/verl/verl/experimental/agent_loop/agent_loop.py", line 807, in _run_all
asyncio.run(run_all())
File "/usr/local/python3.11.13/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/local/python3.11.13/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/verl/verl/experimental/agent_loop/agent_loop.py", line 805, in run_all
await asyncio.gather(*tasks)
File "/verl/verl/workers/rollout/vllm_rollout/vllm_async_server.py", line 549, in sleep
await self.servers[0].wait_for_requests_to_drain.remote()
ray.exceptions.RayTaskError(AttributeError): ray::vLLMHttpServer.wait_for_requests_to_drain() (pid=324161, ip=10.1.68.108, actor_id=7455a93360f01a31e1cf66ac01000000, repr=<verl.workers.rollout.vllm_rollout.vllm_async_server.vLLMHttpServer object at 0xffffb18750d0>)
File "/usr/local/python3.11.13/lib/python3.11/concurrent/futures/_base.py", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "/usr/local/python3.11.13/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
raise self._exception
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/verl/verl/workers/rollout/vllm_rollout/vllm_async_server.py", line 431, in wait_for_requests_to_drain
await self.engine.wait_for_requests_to_drain()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'AsyncLLM' object has no attribute 'wait_for_requests_to_drain'
Expected behavior
The official example scripts should run successfully.
Please provide detailed environment information.