I'm trying to apply DeepSpeed ZeRO Stage 2 to StyleGAN2, but I get the error below.
Here's my config:
{
  "train_batch_size": 4,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0002,
      "betas": [0.5, 0.999],
      "eps": 1e-8
    }
  },
  "steps_per_print": 10,
  "fp16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 2,
    "cpu_offload": true,
    "contiguous_gradients": true,
    "overlap_comm": true
  }
}
And here's the full stack trace:
Traceback (most recent call last):
File "stylegan2_pytorch/ucl_deepspeed.py", line 200, in <module>
main()
File "stylegan2_pytorch/ucl_deepspeed.py", line 197, in main
train_from_folder(deepspeed_args=deepspeed_args)
File "stylegan2_pytorch/ucl_deepspeed.py", line 177, in train_from_folder
run_training(0, 1, model_args, data, load_from, new, num_train_steps, name, seed)
File "stylegan2_pytorch/ucl_deepspeed.py", line 62, in run_training
retry_call(model.train, tries=3, exceptions=NanException)
File "/opt/conda/lib/python3.7/site-packages/retry/api.py", line 101, in retry_call
return __retry_internal(partial(f, *args, **kwargs), exceptions, tries, delay, max_delay, backoff, jitter, logger)
File "/opt/conda/lib/python3.7/site-packages/retry/api.py", line 33, in __retry_internal
return f()
File "/home/dtkatch/stylegan2-pytorch/stylegan2_pytorch/stylegan2_pytorch.py", line 1052, in train
self.GAN.model_engineG.backward(gen_loss)
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/engine.py", line 845, in backward
self.optimizer.backward(loss)
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage2.py", line 1609, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 53, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage2.py", line 594, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage2.py", line 984, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/stage2.py", line 637, in reduce_independent_p_g_buckets_and_remove_grads
new_grad_tensor.copy_(param.grad.view(-1))
AttributeError: 'NoneType' object has no attribute 'view'
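From the trace, the crash happens when ZeRO Stage 2's reduce hook copies param.grad and finds it is None, i.e. some parameter registered with the engine got no gradient from gen_loss (which seems easy to hit in a GAN, where G and D are trained with separate losses). As a sanity check I would list those parameters with something like the sketch below; model and loss are placeholders for my actual generator and loss, not the real code:

import torch

def params_without_grad(model, loss):
    # Parameters that require grad but receive none from this loss are
    # exactly the ones the stage 2 reduce hook later dereferences as None.
    named = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss,
                                [p for _, p in named],
                                retain_graph=True,   # keep the graph for the real backward
                                allow_unused=True)   # return None instead of raising
    return [n for (n, _), g in zip(named, grads) if g is None]

# e.g. print(params_without_grad(generator, gen_loss)) right before
# calling model_engineG.backward(gen_loss)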
Hello @eltonzheng. I am still facing this issue. This is the traceback I'm getting:
Traceback (most recent call last):
File "/cfs/home/u021543/pheye_llavar_accelerate.py", line 68, in <module>
accelerator.backward(loss)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/accelerate/accelerator.py", line 1958, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1955, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2019, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 865, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1377, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/cfs/home/u021543/miniconda3/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 911, in reduce_independent_p_g_buckets_and_remove_grads
new_grad_tensor.copy_(grad_reduc.view(-1))
^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'view'
Like @HUAFOR said, changing to Stage 1 solves it, but I really need Stage 2.
Providing a code example isn't easy in my case, but I can describe what I'm doing: I added three different sets of LoRA adapters to a model. The model processes images, and for each example I run the same model with a different LoRA adapter on the same image at a different size. The adapter that handles the higher resolutions needs more forward passes, so the backward pass is much more expensive, since it has to record gradients for several passes.
TL;DR: I'm doing something that needs to store a lot of gradients for each example, which is why I wanted to use ZeRO Stage 2.
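To make the pattern concrete, here is a rough sketch of the training step (simplified and hypothetical: set_adapter, the adapter names, and images_by_scale stand in for my real code):

import torch

adapter_names = ["lora_small", "lora_medium", "lora_large"]  # hypothetical names

def training_step(model, images_by_scale):
    # One (or more) forward passes per adapter/scale pair; the higher
    # resolutions need several passes, so backward must keep gradients
    # for all of them at once.
    loss = torch.zeros((), device=next(model.parameters()).device)
    for name, image in zip(adapter_names, images_by_scale):
        model.set_adapter(name)     # activate one LoRA set at a time
        loss = loss + model(image)  # assume the model returns a scalar loss
    return loss

# Under ZeRO stage 2, any adapter parameter that did not contribute to
# this loss still has grad=None when the reduce hook fires, which matches
# the AttributeError above.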
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 3710, in
main()
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 3695, in main
globals = debugger.run(setup["file"], None, None, is_module)
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 2691, in run
return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 2699, in _exec
globals = pydevd_runpy.run_path(file, globals, "main")
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 310, in run_path
return _run_module_code(code, init_globals, run_name, pkg_name=pkg_name, script_name=fname)
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 127, in _run_module_code
_run_code(code, mod_globals, init_globals, mod_name, mod_spec, pkg_name, script_name)
File "/root/.vscode-server/extensions/ms-python.debugpy-2025.4.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 118, in _run_code
exec(code, run_globals)
File "/mnt/data/code/videocrafter-training-pytorch-main/train_main.py", line 607, in
trainer.fit(model, data)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1112, in _run
results = self._run_stage()
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1191, in _run_stage
self._run_train()
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1214, in _run_train
self.fit_loop.run()
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 213, in advance
batch_output = self.batch_loop.run(kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 202, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1356, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1742, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 280, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 234, in optimizer_step
return self.precision_plugin.optimizer_step(
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 132, in optimizer_step
closure_result = closure()
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 149, in call
self._result = self.closure(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 144, in closure
self._backward_fn(step_output.closure_loss)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 305, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1494, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 207, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 118, in backward
deepspeed_engine.backward(tensor, *args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2126, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2067, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/torch/autograd/init.py", line 266, in backward
Variable.execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 915, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1426, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/mnt/data/anaconda3/envs/vctrain/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 962, in reduce_independent_p_g_buckets_and_remove_grads
new_grad_tensor.copy_(grad_reduc.view(-1))
AttributeError: 'NoneType' object has no attribute 'view'