I am running into the following error when training VideoScore:
Training model...
Parameter Offload: Total persistent parameters: 706800 in 348 params
0%| | 0/576 [00:00<?, ?it/s]use_cache=True is incompatible with gradient checkpointing. Setting use_cache=False...
[rank0]: Traceback (most recent call last):
[rank0]: File "/aia-cv/root/aia-cv/lgq/VideoScore/Mantis/mantis/train/train_idefics2.py", line 258, in <module>
[rank0]: main(training_args, data_args, model_args)
[rank0]: File "/aia-cv/root/aia-cv/lgq/VideoScore/Mantis/mantis/train/train_idefics2.py", line 232, in main
[rank0]: trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/transformers/trainer.py", line 2052, in train
[rank0]: return inner_training_loop(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
[rank0]: tr_loss_step = self.training_step(model, inputs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/transformers/trainer.py", line 3485, in training_step
[rank0]: loss = self.compute_loss(model, inputs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/transformers/trainer.py", line 3532, in compute_loss
[rank0]: outputs = model(**inputs)
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
[rank0]: ret_val = func(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward
[rank0]: loss = self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1868, in forward
[rank0]: outputs = self.model(
[rank0]: ^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1688, in forward
[rank0]: image_hidden_states = self.connector(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1334, in forward
[rank0]: image_hidden_states = self.perceiver_resampler(context=image_hidden_states, attention_mask=attention_mask)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1304, in forward
[rank0]: layer_outputs = perceiver_layer(
[rank0]: ^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1237, in forward
[rank0]: latents, self_attn_weights, present_key_value = self.self_attn(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: ^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1793, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1020, in forward
[rank0]: attn_output = self._flash_attention_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1079, in _flash_attention_forward
[rank0]: query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
[rank0]: ^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/envs/videoscore_lgq/lib/python3.11/site-packages/mantis/models/idefics2/modeling_idefics2.py", line 1169, in _upad_input
[rank0]: query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ValueError: too many values to unpack (expected 4)
Did you solve this?
Nope, nobody has helped.
I think I solved this by installing transformers==4.47.1.