Ask-Anything
Stage 3 loss becomes NaN after the first step
Here is my stage 3 config:
from configs.instruction_data import *
# ========================= data ==========================
train_corpus = "videochat2_instruction"
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
# import pdb;pdb.set_trace()
# train_file = available_corpus[train_corpus]
test_file = dict()
test_types = []
num_workers = 4
stop_key = None
# ========================= input ==========================
num_frames = 8
num_frames_test = 8
batch_size = 4
max_txt_l = 512
pre_text = False
inputs = dict(
    image_res=224,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size}", video="${batch_size}"),
)
# ========================= model ==========================
model = dict(
    model_cls="VideoChat2_it",
    vit_blip_model_path="./video_chat2/umt_l16_qformer.pth",
    llama_model_path="./video_chat2/vicuna-7b-v0",
    videochat2_model_path="./videochat2_7b_stage3.pth",
    freeze_vit=False,
    freeze_qformer=False,
    max_txt_len="${max_txt_l}",  # use large max_txt_len on stage3
    # vit
    low_resource=False,
    add_temp_embed=False,
    vision_encoder=dict(
        name="vit_l14",
        img_size=224,
        patch_size=16,
        d_model=1024,
        encoder_embed_dim=1024,
        encoder_depth=24,
        encoder_num_heads=16,
        drop_path_rate=0.,
        num_frames="${num_frames}",
        tubelet_size=1,
        use_checkpoint=False,
        checkpoint_num=0,
        pretrained="",
        return_index=-2,
        vit_add_ln=True,
        ckpt_num_frame=4,
    ),
    # qformer
    num_query_token=32,
    qformer_hidden_dropout_prob=0.1,
    qformer_attention_probs_dropout_prob=0.1,
    qformer_drop_path_rate=0.2,
    extra_num_query_token=64,
    qformer_text_input=True,
    # prompt
    system="",
    start_token="<Video>",
    end_token="</Video>",
    add_second_msg=True,
    img_start_token="<Image>",
    img_end_token="</Image>",
    random_shuffle=True,
    use_flash_attention=True,
    use_lora=True,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # debug=True,
)
optimizer = dict(
    opt="adamW",
    # lr=2e-5,
    lr=1e-6,
    opt_betas=[0.9, 0.999],  # default
    weight_decay=0.02,
    # weight_decay=0.00,
    max_grad_norm=-1,  # requires a positive float, use -1 to disable
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
scheduler = dict(sched="cosine", epochs=5, min_lr_multi=0.25, warmup_epochs=0.6)
evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload gpu tensors to cpu to save memory.
)
# fp16 = True
fp16 = False
# bf16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
wandb = dict(
    enable=False,
    entity="yyyin",  # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="videochat2",  # setup in your command line
)
dist_url = "env://"
device = "cuda"
mode = "it"
# ========================= others ==========================
output_dir = './output/stage3' # output dir
resume = False # if True, load optimizer and scheduler states as well
debug = False
log_freq = 1
seed = 6666
save_latest = True
auto_resume = False
pretrained_path = ''
# pretrained_path = "/" # path to pretrained model weights, for resume only?
Then I get the error below; the loss becomes NaN:
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 0/141686] eta: 2 days, 21:16:32 lr: 0.000000 image-loss: 1.0297 time: 1.7602 data: 0.0052 max mem: 22180 res mem: 22500
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 1/141686] eta: 1 day, 16:16:06 lr: 0.000000 image-loss: nan time: 1.0232 data: 0.0027 max mem: 26102 res mem: 26570
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 2/141686] eta: 1 day, 5:42:05 lr: 0.000000 image-loss: nan time: 0.7547 data: 0.0018 max mem: 26102 res mem: 26572
2024-03-26T13:38:05 | utils.basic_utils: Train Epoch: [0] [ 3/141686] eta: 1 day, 0:26:46 lr: 0.000000 image-loss: nan time: 0.6211 data: 0.0014 max mem: 26102 res mem: 26574
2024-03-26T13:38:05 | utils.basic_utils: Train Epoch: [0] [ 4/141686] eta: 21:19:56 lr: 0.000000 image-loss: nan time: 0.5420 data: 0.0016 max mem: 26289 res mem: 26590
In addition, if I set fp16 to True I get:
Traceback (most recent call last):
  File "tasks/train_it.py", line 213, in <module>
    main(cfg)
  File "tasks/train_it.py", line 161, in main
    global_step = train(
  File "tasks/train_it.py", line 67, in train
    scaler.step(optimizer)
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 446, in step
    self.unscale_(optimizer)
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 336, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 258, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")
ValueError: Attempting to unscale FP16 gradients.
You could try setting the language model to bf16 and switching the mixed-precision dtype to bf16 as well; fp16 is prone to producing NaN in some cases.
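A minimal sketch of that suggestion, not the repo's actual training loop (the model, optimizer, and loop below are only illustrative): run autocast in bf16 and drop the GradScaler, since bf16 keeps fp32's exponent range and does not need loss scaling.

import torch
import torch.nn as nn

# Illustrative stand-ins for the real model/optimizer/loader.
model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

for _ in range(3):
    x = torch.randn(4, 16, device="cuda")
    optimizer.zero_grad()
    # Autocast in bf16 instead of fp16; no GradScaler needed.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        loss = model(x).mean()
    loss.backward()   # plain backward; no scaler.scale(loss) / scaler.step()
    optimizer.step()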
I have the same problem. Have you solved it?
You could try setting the language model to bf16 and switching the mixed-precision dtype to bf16 as well; fp16 is prone to producing NaN in some cases.
According to the author's message, after changing to bf16 the loss is no longer NaN.
torch_dtype=torch.bfloat16
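A hedged sketch of that change, assuming the Vicuna weights are loaded through Hugging Face transformers (the repo's actual loading code may differ): passing torch_dtype=torch.bfloat16 keeps the language model's weights in bf16 instead of fp16/fp32.

import torch
from transformers import LlamaForCausalLM

# Path matches llama_model_path in the config above; loading call is illustrative.
llama_model = LlamaForCausalLM.from_pretrained(
    "./video_chat2/vicuna-7b-v0",
    torch_dtype=torch.bfloat16,
)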