RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasGemmStridedBatchedExFix...`
Describe the bug
The error below is thrown when running the code in the Reproduction section against the SD v2 inpainting model. The same code works fine with SD v1. My CUDA and torch versions are listed under System Info.
RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasGemmStridedBatchedExFix( handle, opa, opb, m, n, k, (void*)(&falpha), a, CUDA_R_16F, lda, stridea, b, CUDA_R_16F, ldb, strideb, (void*)(&fbeta), c, CUDA_R_16F, ldc, stridec, num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
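Judging from the traceback, the failing call is the fp16 batched matmul inside CLIP attention (`torch.bmm(query_states, key_states.transpose(1, 2))`), which dispatches to the cuBLAS strided-batched GEMM named in the error. A minimal sketch of that same path, with shapes that are illustrative assumptions rather than values read from the model:

import torch

# Sketch of the op that fails inside CLIPAttention.forward: a
# strided-batched fp16 GEMM dispatched to cuBLAS. The shapes below
# (batch*heads=16, seq_len=77, head_dim=64) are assumptions for
# illustration only.
q = torch.randn(16, 77, 64, dtype=torch.float16, device="cuda")
k = torch.randn(16, 77, 64, dtype=torch.float16, device="cuda")
attn = torch.bmm(q, k.transpose(1, 2))  # -> cublasGemmStridedBatchedEx
print(attn.shape)  # torch.Size([16, 77, 77])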
Reproduction
device = "cuda"
model_path = 'stabilityai/stable-diffusion-2-inpainting'
pipe = StableDiffusionInpaintPipeline.from_pretrained(
model_path,
revision="fp16",
torch_dtype=torch.float16,
use_auth_token=True
).to(device)
# prompt, image, mask_image, guidance_scale, generator, and
# num_samples are defined earlier in the notebook.
images = pipe(
    prompt=prompt,
    image=image,
    mask_image=mask_image,
    guidance_scale=guidance_scale,
    generator=generator,
    num_images_per_prompt=num_samples,
).images[0]
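For comparison, the same checkpoint can be loaded in full float32, which avoids the fp16 GEMM path entirely. This is a sketch of a possible sanity check, not something I have verified on this setup:

# Sketch: load the same checkpoint in float32 to bypass the fp16
# cuBLAS kernels (untested here; for comparison only).
pipe_fp32 = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting",
    torch_dtype=torch.float32,
    use_auth_token=True,
).to("cuda")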
Logs
RuntimeError Traceback (most recent call last)
Input In [4], in <cell line: 4>()
1 prompt = "A portrait of ivanka trump"
2 num_images = 4
----> 4 images = pipe(
5 prompt,
6 num_images_per_prompt=num_images,
7 guidance_scale=9,
8 num_inference_steps=25,
9 height=image_length,
10 width=image_length,
11 ).images
13 media.show_images(images)
14 images[0].save("output.jpg")
File ~/.local/lib/python3.9/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)
File ~/.conda/envs/dui/lib/python3.9/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:518, in StableDiffusionPipeline.__call__(self, prompt, height, width, num_inference_steps, guidance_scale, negative_prompt, num_images_per_prompt, eta, generator, latents, output_type, return_dict, callback, callback_steps)
515 do_classifier_free_guidance = guidance_scale > 1.0
517 # 3. Encode input prompt
--> 518 text_embeddings = self._encode_prompt(
519 prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
520 )
522 # 4. Prepare timesteps
523 self.scheduler.set_timesteps(num_inference_steps, device=device)
File ~/.conda/envs/dui/lib/python3.9/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:299, in StableDiffusionPipeline._encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt)
296 else:
297 attention_mask = None
--> 299 text_embeddings = self.text_encoder(
300 text_input_ids.to(device),
301 attention_mask=attention_mask,
302 )
303 text_embeddings = text_embeddings[0]
305 # duplicate text embeddings for each generation per prompt, using mps friendly method
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/dui/lib/python3.9/site-packages/transformers/models/clip/modeling_clip.py:733, in CLIPTextModel.forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
705 @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
706 @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
707 def forward(
(...)
714 return_dict: Optional[bool] = None,
715 ) -> Union[Tuple, BaseModelOutputWithPooling]:
716 r"""
717 Returns:
718
(...)
731 >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
732 ```"""
--> 733 return self.text_model(
734 input_ids=input_ids,
735 attention_mask=attention_mask,
736 position_ids=position_ids,
737 output_attentions=output_attentions,
738 output_hidden_states=output_hidden_states,
739 return_dict=return_dict,
740 )
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/dui/lib/python3.9/site-packages/transformers/models/clip/modeling_clip.py:649, in CLIPTextTransformer.forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
645 if attention_mask is not None:
646 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
647 attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
--> 649 encoder_outputs = self.encoder(
650 inputs_embeds=hidden_states,
651 attention_mask=attention_mask,
652 causal_attention_mask=causal_attention_mask,
653 output_attentions=output_attentions,
654 output_hidden_states=output_hidden_states,
655 return_dict=return_dict,
656 )
658 last_hidden_state = encoder_outputs[0]
659 last_hidden_state = self.final_layer_norm(last_hidden_state)
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/dui/lib/python3.9/site-packages/transformers/models/clip/modeling_clip.py:578, in CLIPEncoder.forward(self, inputs_embeds, attention_mask, causal_attention_mask, output_attentions, output_hidden_states, return_dict)
571 layer_outputs = torch.utils.checkpoint.checkpoint(
572 create_custom_forward(encoder_layer),
573 hidden_states,
574 attention_mask,
575 causal_attention_mask,
576 )
577 else:
--> 578 layer_outputs = encoder_layer(
579 hidden_states,
580 attention_mask,
581 causal_attention_mask,
582 output_attentions=output_attentions,
583 )
585 hidden_states = layer_outputs[0]
587 if output_attentions:
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/dui/lib/python3.9/site-packages/transformers/models/clip/modeling_clip.py:321, in CLIPEncoderLayer.forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
318 residual = hidden_states
320 hidden_states = self.layer_norm1(hidden_states)
--> 321 hidden_states, attn_weights = self.self_attn(
322 hidden_states=hidden_states,
323 attention_mask=attention_mask,
324 causal_attention_mask=causal_attention_mask,
325 output_attentions=output_attentions,
326 )
327 hidden_states = residual + hidden_states
329 residual = hidden_states
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1190, in Module._call_impl(self, *input, **kwargs)
1186 # If we don't have any hooks, we want to skip the rest of the logic in
1187 # this function, and just call forward.
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/dui/lib/python3.9/site-packages/transformers/models/clip/modeling_clip.py:220, in CLIPAttention.forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
217 value_states = value_states.view(*proj_shape)
219 src_len = key_states.size(1)
--> 220 attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
222 if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
223 raise ValueError(
224 f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
225 f" {attn_weights.size()}"
226 )
RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasGemmStridedBatchedExFix( handle, opa, opb, m, n, k, (void*)(&falpha), a, CUDA_R_16F, lda, stridea, b, CUDA_R_16F, ldb, strideb, (void*)(&fbeta), c, CUDA_R_16F, ldc, stridec, num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
System Info
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Thu_Nov_18_09:45:30_PST_2021
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0
$ pip freeze | grep torch
pytorch-lightning==1.8.1
torch==1.13.0
torch-model-archiver @ file:///home/ubuntu/anaconda3/envs/py38/conda-bld/torch-model-archiver_1645846597533/work
torch-workflow-archiver @ file:///home/ubuntu/anaconda3/envs/py38/conda-bld/torch-workflow-archiver_1652462579732/work
torchaudio==0.13.0
torchmetrics==0.10.2
torchserve @ file:///home/ubuntu/anaconda3/envs/py38/conda-bld/torchserve_1652462046145/work
torchtext==0.12.0
torchvision==0.14.0
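Note that the pip-installed torch wheel bundles its own CUDA runtime, which can differ from the system nvcc above. The versions torch actually uses can be printed with a snippet like this (standard torch APIs):

import torch

# The pip wheel ships its own CUDA runtime, which may differ from the
# system-wide nvcc 11.5 reported above.
print(torch.__version__)                    # torch version
print(torch.version.cuda)                   # CUDA runtime the wheel was built with
print(torch.backends.cudnn.version())       # bundled cuDNN version
print(torch.cuda.get_device_name(0))        # GPU model
print(torch.cuda.get_device_capability(0))  # compute capability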