🐛 Describe the bug
My command is below. I tried both --placement="cpu" and --placement="auto" and got the same error with each. Because my GPU has only 10 GB of memory, running with --placement="gpu" fails with a CUDA out-of-memory error.
torchrun --nproc_per_node 1 /ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" --instance_data_dir="/ColossalAI/pokemon/img" --output_dir="/ColossalAI/outputs" --instance_prompt="A pokemon with green eyes, large wings, and a hat" --resolution=512 --train_batch_size=1 --learning_rate=5e-6 --lr_scheduler="constant" --lr_warmup_steps=0 --max_train_steps=10 --placement="auto" --save_steps 5
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/qll/ColossalAI_2/ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py:691 │
│ in <module> │
│ │
│ 688 │
│ 689 if __name__ == "__main__": │
│ 690 │ args = parse_args() │
│ ❱ 691 │ main(args) │
│ 692 │
│ │
│ /data/qll/ColossalAI_2/ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py:618 │
│ in main │
│ │
│ 615 │ │ │ encoder_hidden_states = text_encoder(batch["input_ids"])[0] │
│ 616 │ │ │ │
│ 617 │ │ │ # Predict the noise residual │
│ ❱ 618 │ │ │ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample │
│ 619 │ │ │ │
│ 620 │ │ │ # Get the target for loss depending on the prediction type │
│ 621 │ │ │ if noise_scheduler.config.prediction_type == "epsilon": │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │
│ in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/colossalai/nn/parallel/data_para │
│ llel.py:274 in forward │
│ │
│ 271 │ │ self.module.zero_grad(set_to_none=True) │
│ 272 │ │ self.gemini_manager.pre_iter(*args) │
│ 273 │ │ with ColoParamOpHookManager.use_hooks(self.param_op_hook): │
│ ❱ 274 │ │ │ outputs = self.module(*args, **kwargs) │
│ 275 │ │ if self.force_outputs_fp32: │
│ 276 │ │ │ return _cast_float(outputs, torch.float) │
│ 277 │ │ return outputs │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │
│ in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/diffusers/models/unet_2d_conditi │
│ on.py:405 in forward │
│ │
│ 402 │ │ # but time_embedding might actually be running in fp16. so we need to cast here. │
│ 403 │ │ # there might be better ways to encapsulate this. │
│ 404 │ │ t_emb = t_emb.to(dtype=self.dtype) │
│ ❱ 405 │ │ emb = self.time_embedding(t_emb) │
│ 406 │ │ │
│ 407 │ │ if self.class_embedding is not None: │
│ 408 │ │ │ if class_labels is None: │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │
│ in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/diffusers/models/embeddings.py:8 │
│ 2 in forward │
│ │
│ 79 │ │ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) │
│ 80 │ │
│ 81 │ def forward(self, sample): │
│ ❱ 82 │ │ sample = self.linear_1(sample) │
│ 83 │ │ │
│ 84 │ │ if self.act is not None: │
│ 85 │ │ │ sample = self.act(sample) │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │
│ in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/linear.py:114 │
│ in forward │
│ │
│ 111 │ │ │ init.uniform_(self.bias, -bound, bound) │
│ 112 │ │
│ 113 │ def forward(self, input: Tensor) -> Tensor: │
│ ❱ 114 │ │ return F.linear(input, self.weight, self.bias) │
│ 115 │ │
│ 116 │ def extra_repr(self) -> str: │
│ 117 │ │ return 'in_features={}, out_features={}, bias={}'.format( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Environment
NVIDIA-SMI 470.161.03 Driver Version: 470.161.03 CUDA Version: 11.4
torch:torch-1.12.1+cu113-cp310-cp310-linux_x86_64
torch.version.cuda:'11.3'