Ask-Anything
Stage 3 loss becomes NaN after the first step
Here is my stage 3 config:
from configs.instruction_data import *
# ========================= data ==========================
train_corpus = "videochat2_instruction"
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
# import pdb;pdb.set_trace()
# train_file = available_corpus[train_corpus]
test_file = dict()
test_types = []
num_workers = 4
stop_key = None
# ========================= input ==========================
num_frames = 8
num_frames_test = 8
batch_size = 4
max_txt_l = 512
pre_text = False
inputs = dict(
    image_res=224,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size}", video="${batch_size}"),
)
# ========================= model ==========================
model = dict(
    model_cls="VideoChat2_it",
    vit_blip_model_path="./video_chat2/umt_l16_qformer.pth",
    llama_model_path="./video_chat2/vicuna-7b-v0",
    videochat2_model_path="./videochat2_7b_stage3.pth",
    freeze_vit=False,
    freeze_qformer=False,
    max_txt_len="${max_txt_l}",  # use large max_txt_len on stage3
    # vit
    low_resource=False,
    add_temp_embed=False,
    vision_encoder=dict(
        name="vit_l14",
        img_size=224,
        patch_size=16,
        d_model=1024,
        encoder_embed_dim=1024,
        encoder_depth=24,
        encoder_num_heads=16,
        drop_path_rate=0.,
        num_frames="${num_frames}",
        tubelet_size=1,
        use_checkpoint=False,
        checkpoint_num=0,
        pretrained="",
        return_index=-2,
        vit_add_ln=True,
        ckpt_num_frame=4,
    ),
    # qformer
    num_query_token=32,
    qformer_hidden_dropout_prob=0.1,
    qformer_attention_probs_dropout_prob=0.1,
    qformer_drop_path_rate=0.2,
    extra_num_query_token=64,
    qformer_text_input=True,
    # prompt
    system="",
    start_token="<Video>",
    end_token="</Video>",
    add_second_msg=True,
    img_start_token="<Image>",
    img_end_token="</Image>",
    random_shuffle=True,
    use_flash_attention=True,
    use_lora=True,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # debug=True,
)
optimizer = dict(
    opt="adamW",
    # lr=2e-5,
    lr=1e-6,
    opt_betas=[0.9, 0.999],  # default
    weight_decay=0.02,
    # weight_decay=0.00,
    max_grad_norm=-1,  # requires a positive float, use -1 to disable
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
scheduler = dict(sched="cosine", epochs=5, min_lr_multi=0.25, warmup_epochs=0.6)
evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload gpu tensors to cpu to save memory.
)
# fp16 = True
fp16 = False
# bf16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
wandb = dict(
    enable=False,
    entity="yyyin",  # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="videochat2",  # setup in your command line
)
dist_url = "env://"
device = "cuda"
mode = "it"
# ========================= others ==========================
output_dir = './output/stage3' # output dir
resume = False # if True, load optimizer and scheduler states as well
debug = False
log_freq = 1
seed = 6666
save_latest = True
auto_resume = False
pretrained_path = ''
# pretrained_path = "/" # path to pretrained model weights, for resume only?
Then I get the error below; the loss becomes NaN:
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 0/141686] eta: 2 days, 21:16:32 lr: 0.000000 image-loss: 1.0297 time: 1.7602 data: 0.0052 max mem: 22180 res mem: 22500
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 1/141686] eta: 1 day, 16:16:06 lr: 0.000000 image-loss: nan time: 1.0232 data: 0.0027 max mem: 26102 res mem: 26570
2024-03-26T13:38:04 | utils.basic_utils: Train Epoch: [0] [ 2/141686] eta: 1 day, 5:42:05 lr: 0.000000 image-loss: nan time: 0.7547 data: 0.0018 max mem: 26102 res mem: 26572
2024-03-26T13:38:05 | utils.basic_utils: Train Epoch: [0] [ 3/141686] eta: 1 day, 0:26:46 lr: 0.000000 image-loss: nan time: 0.6211 data: 0.0014 max mem: 26102 res mem: 26574
2024-03-26T13:38:05 | utils.basic_utils: Train Epoch: [0] [ 4/141686] eta: 21:19:56 lr: 0.000000 image-loss: nan time: 0.5420 data: 0.0016 max mem: 26289 res mem: 26590
In addition, if I set fp16 to True I get:
Traceback (most recent call last):
  File "tasks/train_it.py", line 213, in <module>
    main(cfg)
  File "tasks/train_it.py", line 161, in main
    global_step = train(
  File "tasks/train_it.py", line 67, in train
    scaler.step(optimizer)
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 446, in step
    self.unscale_(optimizer)
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 336, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
  File "/usr/local/lib/python3.8/dist-packages/torch/cuda/amp/grad_scaler.py", line 258, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")
ValueError: Attempting to unscale FP16 gradients.
You could try setting the language model to bf16 and switching the mixed-precision dtype to bf16 as well; fp16 is prone to producing NaN in some cases.
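A minimal sketch of that suggestion, not the repo's actual training loop (the model, optimizer, and loop below are only illustrative): run autocast in bf16 and drop the GradScaler, since bf16 keeps fp32's exponent range and does not need loss scaling.

import torch
import torch.nn as nn

# Illustrative stand-ins for the real model/optimizer/loader.
model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

for _ in range(3):
    x = torch.randn(4, 16, device="cuda")
    optimizer.zero_grad()
    # Autocast in bf16 instead of fp16; no GradScaler needed.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        loss = model(x).mean()
    loss.backward()   # plain backward; no scaler.scale(loss) / scaler.step()
    optimizer.step()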
I have the same problem. Have you solved it?
You could try setting the language model to bf16 and switching the mixed-precision dtype to bf16 as well; fp16 is prone to producing NaN in some cases.
According to the author's message, after changing to bf16 the loss is no longer NaN.
torch_dtype=torch.bfloat16
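A hedged sketch of that change, assuming the Vicuna weights are loaded through Hugging Face transformers (the repo's actual loading code may differ): passing torch_dtype=torch.bfloat16 keeps the language model's weights in bf16 instead of fp16/fp32.

import torch
from transformers import LlamaForCausalLM

# Path matches llama_model_path in the config above; loading call is illustrative.
llama_model = LlamaForCausalLM.from_pretrained(
    "./video_chat2/vicuna-7b-v0",
    torch_dtype=torch.bfloat16,
)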