When accumulation-steps > 1 and amp_status = '01' are set, the following error occurs:
norm 0.2449 (0.2449) loss_scale 65536.0000 (65536.0000) mem 3750MB
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
Traceback (most recent call last):
File "main_simmim_pt.py", line 235, in <module>
main(config)
File "main_simmim_pt.py", line 106, in main
train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, scaler)
File "main_simmim_pt.py", line 138, in train_one_epoch
scaler.unscale_(optimizer)
File "/miniconda/envs/py37/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 270, in unscale_
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
RuntimeError: unscale_() has already been called on this optimizer since the last update().
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 74) of binary: /miniconda/envs/py37/bin/python