RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0!

Open QiuLL opened this issue 3 years ago • 0 comments

🐛 Describe the bug

My command is below. I tried both --placement="cpu" and --placement="auto" and got the same error in each case. Because my GPU has only 10 GB of memory, running with --placement="gpu" fails with a CUDA out-of-memory error instead. Command: torchrun --nproc_per_node 1 /ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" --instance_data_dir="/ColossalAI/pokemon/img" --output_dir="/ColossalAI/outputs" --instance_prompt="A pokemon with green eyes, large wings, and a hat" --resolution=512 --train_batch_size=1 --learning_rate=5e-6 --lr_scheduler="constant" --lr_warmup_steps=0 --max_train_steps=10 --placement="auto" --save_steps 5

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /data/qll/ColossalAI_2/ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py:691 │ │ in │ │ │ │ 688 │ │ 689 if name == "main": │ │ 690 │ args = parse_args() │ │ ❱ 691 │ main(args) │ │ 692 │ │ │ │ /data/qll/ColossalAI_2/ColossalAI/examples/images/dreambooth/train_dreambooth_colossalai.py:618 │ │ in main │ │ │ │ 615 │ │ │ encoder_hidden_states = text_encoder(batch["input_ids"])[0] │ │ 616 │ │ │ │ │ 617 │ │ │ # Predict the noise residual │ │ ❱ 618 │ │ │ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample │ │ 619 │ │ │ │ │ 620 │ │ │ # Get the target for loss depending on the prediction type │ │ 621 │ │ │ if noise_scheduler.config.prediction_type == "epsilon": │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │ │ in _call_impl │ │ │ │ 1127 │ │ # this function, and just call forward. │ │ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │ │ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │ │ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │ │ 1131 │ │ # Do not call functions when jit is used │ │ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │ │ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/colossalai/nn/parallel/data_para │ │ llel.py:274 in forward │ │ │ │ 271 │ │ self.module.zero_grad(set_to_none=True) │ │ 272 │ │ self.gemini_manager.pre_iter(*args) │ │ 273 │ │ with ColoParamOpHookManager.use_hooks(self.param_op_hook): │ │ ❱ 274 │ │ │ outputs = self.module(*args, **kwargs) │ │ 275 │ │ if self.force_outputs_fp32: │ │ 276 │ │ │ return _cast_float(outputs, torch.float) │ │ 277 │ │ return outputs │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │ │ in _call_impl │ │ 
│ │ 1127 │ │ # this function, and just call forward. │ │ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │ │ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │ │ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │ │ 1131 │ │ # Do not call functions when jit is used │ │ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │ │ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/diffusers/models/unet_2d_conditi │ │ on.py:405 in forward │ │ │ │ 402 │ │ # but time_embedding might actually be running in fp16. so we need to cast here. │ │ 403 │ │ # there might be better ways to encapsulate this. │ │ 404 │ │ t_emb = t_emb.to(dtype=self.dtype) │ │ ❱ 405 │ │ emb = self.time_embedding(t_emb) │ │ 406 │ │ │ │ 407 │ │ if self.class_embedding is not None: │ │ 408 │ │ │ if class_labels is None: │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │ │ in _call_impl │ │ │ │ 1127 │ │ # this function, and just call forward. 
│ │ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │ │ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │ │ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │ │ 1131 │ │ # Do not call functions when jit is used │ │ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │ │ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/diffusers/models/embeddings.py:8 │ │ 2 in forward │ │ │ │ 79 │ │ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) │ │ 80 │ │ │ 81 │ def forward(self, sample): │ │ ❱ 82 │ │ sample = self.linear_1(sample) │ │ 83 │ │ │ │ 84 │ │ if self.act is not None: │ │ 85 │ │ │ sample = self.act(sample) │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1130 │ │ in _call_impl │ │ │ │ 1127 │ │ # this function, and just call forward. │ │ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │ │ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │ │ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │ │ 1131 │ │ # Do not call functions when jit is used │ │ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │ │ 1133 │ │ if self._backward_hooks or global_backward_hooks: │ │ │ │ /home/qll/anaconda3/envs/diffusion/lib/python3.10/site-packages/torch/nn/modules/linear.py:114 │ │ in forward │ │ │ │ 111 │ │ │ init.uniform(self.bias, -bound, bound) │ │ 112 │ │ │ 113 │ def forward(self, input: Tensor) -> Tensor: │ │ ❱ 114 │ │ return F.linear(input, self.weight, self.bias) │ │ 115 │ │ │ 116 │ def extra_repr(self) -> str: │ │ 117 │ │ return 'in_features={}, out_features={}, bias={}'.format( │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! 
(when checking argument for argument mat1 in method wrapper_addmm)

Environment

NVIDIA-SMI 470.161.03, Driver Version: 470.161.03, CUDA Version: 11.4; torch: torch-1.12.1+cu113-cp310-cp310-linux_x86_64; torch.version.cuda: '11.3'

Jan 10 '23 12:01 QiuLL