🐛 Describe the bug
Hi, I am using Docker to run ColossalAI/examples/images/diffusion/ and tried the https://hub.docker.com/r/hpcaitech/colossalai/tags images 0.2.5, 0.2.4, 0.2.2, and 0.1.10-torch1.11-cu11.3 with train_colossalai_teyvat.yaml and 512-base-ema.ckpt. However, there is an error:
Epoch 0: 44%|▍| 103/234 [01:07<01:25, 1.53it/s, loss=0.401, v_num=0, train/loss_simple_step=0.350, train/losEpoch 0: 100%|█| 234/234 [02:30<00:00, 1.56it/s, loss=0.322, v_num=0, train/loss_simple_step=0.752, train/los/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:536: PossibleUserWarning: It is recommended to use self.log('train/loss_simple', ..., sync_dist=True) when logging on epoch level in distributed setting to accumulate the metric across devices.
warning_cache.warn(
/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:536: PossibleUserWarning: It is recommended to use self.log('train/loss_vlb', ..., sync_dist=True) when logging on epoch level in distributed setting to accumulate the metric across devices.
warning_cache.warn(
/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:536: PossibleUserWarning: It is recommended to use self.log('train/loss', ..., sync_dist=True) when logging on epoch level in distributed setting to accumulate the metric across devices.
warning_cache.warn(
Epoch 0: 100%|█| 234/234 [02:30<00:00, 1.56it/s, loss=0.322, v_num=0, train/loss_simple_step=0.752, train/losSummoning checkpoint.
Traceback (most recent call last):
File "/opt/ml/main.py", line 810, in
trainer.fit(model, data)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1112, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1191, in _run_stage
self._run_train()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1214, in _run_train
self.fit_loop.run()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
self.on_advance_end()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1394, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
self._save_topk_checkpoint(trainer, monitor_candidates)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 363, in _save_topk_checkpoint
self._save_none_monitor_checkpoint(trainer, monitor_candidates)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 669, in _save_none_monitor_checkpoint
self._save_checkpoint(trainer, filepath)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 366, in _save_checkpoint
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1939, in save_checkpoint
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 510, in save_checkpoint
_checkpoint = self.dump_checkpoint(weights_only)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 442, in dump_checkpoint
"state_dict": self._get_lightning_module_state_dict(),
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 524, in _get_lightning_module_state_dict
state_dict = self.trainer.strategy.lightning_module_state_dict()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 399, in lightning_module_state_dict
org_dict = self.model.state_dict(only_rank_0=rank_zero_only)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 352, in state_dict
torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/utils.py", line 101, in get_static_torch_model
assert param in colo_to_torch, f"can not find parameter {full_param_name} in the GeminiDDP module"
AssertionError: can not find parameter _forward_module.first_stage_model.encoder.conv_in.weight in the GeminiDDP module
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/ml/main.py", line 812, in
melk()
File "/opt/ml/main.py", line 795, in melk
trainer.save_checkpoint(ckpt_path)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1939, in save_checkpoint
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 510, in save_checkpoint
_checkpoint = self.dump_checkpoint(weights_only)
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 442, in dump_checkpoint
"state_dict": self._get_lightning_module_state_dict(),
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 524, in _get_lightning_module_state_dict
state_dict = self.trainer.strategy.lightning_module_state_dict()
File "/opt/conda/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 399, in lightning_module_state_dict
org_dict = self.model.state_dict(only_rank_0=rank_zero_only)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 352, in state_dict
torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/utils.py", line 101, in get_static_torch_model
assert param in colo_to_torch, f"can not find parameter {full_param_name} in the GeminiDDP module"
AssertionError: can not find parameter _forward_module.first_stage_model.encoder.conv_in.weight in the GeminiDDP module
Environment
NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6
The problem is fixed by https://github.com/hpcaitech/ColossalAI/pull/2443, but you have to build ColossalAI from source, since that commit is not included in the latest release (0.2.5).