My machine has two RTX 3090 GPUs, 256 GB of memory, and an Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz. When I use the provided Docker image glm-cuda112 to fine-tune GLM on SuperGLUE COPA, the following error occurs:
WARNING: could not find the metadata file /root/data/checkpoints/blocklm-base-blank/latest_checkpointed_iteration.txt
Try to directly load the checkpoint from the directory
global rank 0 is loading pretrained model /root/data/checkpoints/blocklm-base-blank/mp_rank_00_model_states.pt
Traceback (most recent call last):
File "finetune_glm.py", line 469, in <module>
main(args)
File "/workspace/GLM-main/tasks/superglue/finetune.py", line 100, in main
finetune(args, train_valid_datasets_provider, model_kwargs,
File "/workspace/GLM-main/finetune_glm.py", line 379, in finetune
load_pretrained(model, args.load_pretrained, args, task_tokens=task_tokens)
File "/workspace/GLM-main/train_utils.py", line 23, in load_pretrained
sd = torch.load(checkpoint_name, map_location='cpu')
File "/opt/conda/lib/python3.8/site-packages/torch/serialization.py", line 593, in load
return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File "/opt/conda/lib/python3.8/site-packages/torch/serialization.py", line 762, in _legacy_load
magic_number = pickle_module.load(f, **pickle_load_args)
issue.txt
EOFError: Ran out of input
Killing subprocess 28691
Killing subprocess 28692
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 171, in <module>
main()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 161, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 139, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', 'finetune_glm.py', '--local_rank=1', '--deepspeed', '--deepspeed_config', 'config_tasks/config_blocklm_10B.json', '--finetune', '--cloze-eval', '--experiment-name', 'blank-base-copa_04-17-02-31', '--task', 'COPA', '--data-dir', '/root/data/superglue/COPA', '--save', '/root/data/checkpoints', '--seq-length', '256', '--checkpoint-activations', '--eval-batch-size', '16', '--save-epoch', '100000', '--num-workers', '1', '--no-load-optim', '--no-load-lr-scheduler', '--block-lm', '--num-layers', '12', '--hidden-size', '768', '--num-attention-heads', '12', '--max-position-embeddings', '512', '--tokenizer-model-type', 'bert-base-uncased', '--tokenizer-type', 'BertWordPieceTokenizer', '--load-pretrained', '/root/data/checkpoints/blocklm-base-blank', '--lr-decay-style', 'linear', '--warmup', '0.1', '--weight-decay', '1.0e-1', '--pattern-id', '0', '--save-interval', '10000', '--log-interval', '20', '--eval-interval', '1000', '--eval-iters', '100', '--fp16', '--model-parallel-size', '1', '--continuous-prompt', '--num-prompt-tokens', '3', '--epochs', '100', '--overwrite']' returned non-zero exit status 1