[QUESTION] Megatron-LM installation with CUDA 11.6
Hi,
I am using a single-node multi-GPU cluster (6× A100) and would like to use Megatron-LM to train a Llama 2 model on it. In this environment I cannot use Docker, and the CUDA version is fixed at 11.6.
My question is whether it is possible to install Megatron-LM in this environment.
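For reference, this is how the toolkit version can be checked on the cluster (nvcc reports the installed CUDA toolkit, nvidia-smi the maximum runtime version supported by the driver):
nvcc --version   # reports "release 11.6" on this cluster
nvidia-smi       # driver version and maximum supported CUDA runtime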
After installing the required packages (including apex), I ran
pip install git+https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
pip install .
and then ran a shell script that basically just calls pretrain_gpt.py. It fails with the following error message saying "no module named transformer_engine":
~/pretrain/Megatron-LM$ sh examples/pretrain_llama2.sh
+ DATASET_1=./dataset/arxiv_train_text_sentence
+ DATASET=1 ./dataset/arxiv_train_text_sentence
+ CHECKPOINT_PATH=./outputs/llama2-1.3b/checkpoints/
+ TOKENIZER_PATH=./tokenizer/code10k_en20k_ja30k.ver2.1.model
+ TP=1
+ PP=2
+ GPUS_PER_NODE=2
+ MASTER_ADDR=localhost
+ MASTER_PORT=6000
+ NNODES=1
+ NODE_RANK=0
+ HIDDEN_SIZE=2048
+ FFN_HIDDEN_SIZE=5504
+ NUM_LAYERS=24
+ NUM_HEADS=16
+ SEQ_LENGTH=2048
+ NUM_KV_HEADS=4
+ MICRO_BATCH_SIZE=4
+ GLOBAL_BATCH_SIZE=32
+ TRAIN_STEPS=2500
+ LR=3e-4
+ MIN_LR=3e-5
+ LR_WARMUP_STEPS=20
+ WEIGHT_DECAY=0.1
+ GRAD_CLIP=1
+ activation_checkpoint=false
+ DISTRIBUTED_ARGS=--nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6000
+ torchrun --nproc_per_node 2 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6000 pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 24 --hidden-size 2048 --ffn-hidden-size 5504 --num-attention-heads 16 --micro-batch-size 4 --global-batch-size 32 --seq-length 2048 --max-position-embeddings 2048 --train-iters 2500 --save ./outputs/llama2-1.3b/checkpoints/ --load ./outputs/llama2-1.3b/checkpoints/ --data-path 1 ./dataset/arxiv_train_text_sentence --data-impl mmap --tokenizer-type SentencePieceTokenizer --tokenizer-model ./tokenizer/code10k_en20k_ja30k.ver2.1.model --split 949,50,1 --distributed-backend nccl --lr 3e-4 --lr-decay-style cosine --min-lr 3e-5 --weight-decay 0.1 --clip-grad 1 --lr-warmup-iters 20 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.95 --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --bf16 --no-query-key-layer-scaling --attention-dropout 0 --hidden-dropout 0 --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --num-key-value-heads 4
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Zarr-based strategies will not be registered because of missing packages
(both worker processes print the same traceback)
Traceback (most recent call last):
  File "/home/mais/pretrain/Megatron-LM/pretrain_gpt.py", line 8, in <module>
    from megatron import get_args
  File "/home/mais/pretrain/Megatron-LM/megatron/__init__.py", line 16, in <module>
    from .initialize import initialize_megatron
  File "/home/mais/pretrain/Megatron-LM/megatron/initialize.py", line 18, in <module>
    from megatron.arguments import parse_args, validate_args
  File "/home/mais/pretrain/Megatron-LM/megatron/arguments.py", line 16, in <module>
    from megatron.core.models.retro import RetroConfig
  File "/home/mais/pretrain/Megatron-LM/megatron/core/models/retro/__init__.py", line 4, in <module>
    from .decoder_spec import get_retro_decoder_block_spec
  File "/home/mais/pretrain/Megatron-LM/megatron/core/models/retro/decoder_spec.py", line 5, in <module>
    from megatron.core.models.gpt.gpt_layer_specs import (
  File "/home/mais/pretrain/Megatron-LM/megatron/core/models/gpt/__init__.py", line 1, in <module>
    from .gpt_model import GPTModel
  File "/home/mais/pretrain/Megatron-LM/megatron/core/models/gpt/gpt_model.py", line 17, in <module>
    from megatron.core.transformer.transformer_block import TransformerBlock
  File "/home/mais/pretrain/Megatron-LM/megatron/core/transformer/transformer_block.py", line 16, in <module>
    from megatron.core.transformer.custom_layers.transformer_engine import (
  File "/home/mais/pretrain/Megatron-LM/megatron/core/transformer/custom_layers/transformer_engine.py", line 7, in <module>
    import transformer_engine as te
ModuleNotFoundError: No module named 'transformer_engine'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1544321) of binary: /home/mais/.mlm_pretrain/bin/python
Traceback (most recent call last):
  File "/home/mais/.mlm_pretrain/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/home/mais/.mlm_pretrain/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/mais/.mlm_pretrain/lib/python3.10/site-packages/torch/distributed/run.py", line 762, in main
    run(args)
  File "/home/mais/.mlm_pretrain/lib/python3.10/site-packages/torch/distributed/run.py", line 753, in run
    elastic_launch(
  File "/home/mais/.mlm_pretrain/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/mais/.mlm_pretrain/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
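The same error can be reproduced outside of torchrun with a one-liner (a minimal check, independent of the training script):
python -c "import transformer_engine"
# -> ModuleNotFoundError: No module named 'transformer_engine'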
Then I found https://github.com/NVIDIA/TransformerEngine, but it looks like it requires CUDA >= 11.8.
It would be very helpful if you could give me some advice on how to sort this out.
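For completeness, this is the installation command from the TransformerEngine README as I understand it; I have not been able to use it, since the README lists CUDA >= 11.8 as a requirement while this cluster is fixed at 11.6:
# Install command from the TransformerEngine README (stable branch);
# presumably it cannot be built against the CUDA 11.6 toolkit here.
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable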
Marking as stale. No activity in 60 days.