[BUG] Always get errno: 110 - Connection timed out when using deepspeed multi-node training.
**Describe the bug**
I'm trying to use DeepSpeed to fine-tune a BERT-based classification model, but when I launch multi-node training, every node (including localhost) fails with `errno: 110 - Connection timed out`.
**To Reproduce**
Here is my code:
```python
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from datasets import load_from_disk  # provides load_from_disk("bert_train_data") below
import torch
from tqdm.auto import tqdm
import argparse
import deepspeed
import random
import numpy as np

from utils import get_train_ds_config


def parse_args():
    parser = argparse.ArgumentParser(
        description="Finetune a transformers model on a multi-modal task")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-3,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--learning_rate_pretraining_components",
        type=float,
        default=0,
        help="Initial learning rate for pre-trained weight, e.g., embedding (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.,
                        help="Weight decay to use.")
    parser.add_argument("--num_train_epochs",
                        type=int,
                        default=6,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        default="cosine",
        help="The scheduler type to use.",
        choices=[
            "linear", "cosine", "cosine_with_restarts", "polynomial",
            "constant", "constant_with_warmup"
        ],
    )
    parser.add_argument(
        "--num_warmup_steps",
        type=float,
        default=0.1,
        help="Number of steps (>1) or ratio (<=1) for the warmup in the lr scheduler.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--lm_model_name_or_path",
        type=str,
        default='baichuan-inc/Baichuan2-13B-Chat',
        # baichuan-inc/Baichuan2-13B-Base /raid/models/med_vqa/LLama/Llama-2-13b-chat-hf
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        # required=True,
    )
    # deepspeed features
    parser.add_argument(
        '--zero_stage',
        type=int,
        default=2,
        help='ZeRO optimization stage for Actor model (and clones).')
    parser.add_argument(
        "--precision",
        type=str,
        choices=["fp16", "bf16"],
        default="fp16",
        help="FP16 or BF16 precision. FP16 is recommended for typical use cases. BF16 is good for large models",
    )
    parser.add_argument("--output_dir",
                        type=str,
                        default='./temp_output/',
                        help="Where to store the model.")
    parser.add_argument('--enable_tensorboard',
                        action='store_true',
                        default=False,
                        help='Enable tensorboard logging')
    ## LoRA for efficient training setting
    parser.add_argument("--lang_lora_dim",
                        type=int,
                        default=0,
                        help="Use LoRA for fine-tuning language decoder (> 0).")
    parser.add_argument("--lang_lora_module_name",
                        type=str,
                        default="model.layers.",
                        help="The scope name of the target LoRA parameters.")
    parser.add_argument('--only_optimize_lora',
                        action='store_true',
                        help='Only optimize the LoRA parameters.')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()

    if args.learning_rate_pretraining_components == 0.0:
        # if we do not provide a special learning rate (mainly for embeddings), the same lr is applied
        args.learning_rate_pretraining_components = args.learning_rate
    assert args.num_warmup_steps >= 0, "--num_warmup_steps must be >= 0"
    return args


def main():
    args = parse_args()

    if args.local_rank == -1:
        device = torch.device("cuda")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        deepspeed.init_distributed()

    args.global_rank = torch.distributed.get_rank()

    ds_config = get_train_ds_config(args, offload=True, stage=args.zero_stage)
    ds_config['train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
    ds_config['train_batch_size'] = args.per_device_train_batch_size * \
        torch.distributed.get_world_size() * args.gradient_accumulation_steps

    torch.distributed.barrier()

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

    # Prepare Dataset
    dataset = load_from_disk("bert_train_data")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    dataset['train'] = dataset["train"].shuffle(seed=42).select(range(2000))
    dataset['test'] = dataset["test"].shuffle(seed=42).select(range(2000))
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(small_eval_dataset, batch_size=args.per_device_train_batch_size)
    # Finish Dataset loading

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type, optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps, num_training_steps=num_training_steps
    )

    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # model.to(device)

    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        args=args,
        config=ds_config,
        lr_scheduler=lr_scheduler,
        dist_init_required=True)

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            model.backward(loss)
            # loss.backward()
            model.step()
            # lr_scheduler.step()
            # optimizer.zero_grad()
            progress_bar.update(1)

    client_state = {
        'random_rng_state': random.getstate(),
        'np_rng_state': np.random.get_state(),
        'torch_rng_state': torch.get_rng_state(),
        'torch_cuda_rng_state': torch.cuda.get_rng_state(),
        'epoch': epoch + 1,  # start from next epoch
    }
    model.save_checkpoint(args.output_dir, client_state=client_state)  # save to the latest
    print('FINISH')


if __name__ == "__main__":
    main()
```
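The `utils.get_train_ds_config` helper is not shown above; for context, here is a minimal sketch of the kind of dictionary it is assumed to return (the field names follow the public DeepSpeed config schema, but the exact values are guesses):

```python
# utils.py (sketch) - a minimal ZeRO config of the shape the training script expects;
# the real helper may set more or different fields.
def get_train_ds_config(args, offload=True, stage=2):
    return {
        # batch-size fields are overwritten in main() once the world size is known
        "train_micro_batch_size_per_gpu": args.per_device_train_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "steps_per_print": 10,
        "zero_optimization": {
            "stage": stage,
            "offload_optimizer": {"device": "cpu" if offload else "none"},
        },
        "fp16": {"enabled": args.precision == "fp16"},
        "bf16": {"enabled": args.precision == "bf16"},
        "gradient_clipping": 1.0,
    }
```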
**Expected behavior**
I have successfully launched multi-GPU training on a single node with this code, and I expect the same training to launch in a multi-node environment.
**ds_report output**
```
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
runtime if needed. Op compatibility means that your system
meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
fused_adam ............. [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
[WARNING] using untested triton version (2.0.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/root/anaconda3/envs/ds/lib/python3.10/site-packages/torch']
torch version .................... 2.0.1+cu117
deepspeed install path ........... ['/root/anaconda3/envs/ds/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.12.5, unknown, unknown
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 10.1
deepspeed wheel compiled w. ...... torch 2.0, cuda 11.7
shared memory (/dev/shm) size .... 251.83 GB
```
**System info** (please complete the following information):
- OS: Ubuntu 20.04.6 LTS
- GPU count and types: Tesla V100-PCIE-16G
- Interconnects: all machines can SSH to each other without a password.
- Python version: 3.9
- I have unset http_proxy and https_proxy
**Launcher context**
```
deepspeed -H=host.txt --master_port=23 bert_ft.py
```
host.txt is as follows:
```
localhost slots=8
xx.xx.xx.xx slots=8
```
**Additional context**
I have tried changing `--master_port`, but it did not solve the problem.
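A minimal sketch for checking whether the master node's rendezvous port is reachable over plain TCP from a worker node; it uses only Python's standard `socket` module, and the address/port below are placeholders to fill in from `host.txt` and `--master_port`:

```python
# check_port.py - run on a worker node while something is already listening on the
# master node (e.g. start `python3 -m http.server <port>` there first).
import socket
import sys

MASTER_ADDR = "xx.xx.xx.xx"  # placeholder: address of the first host in host.txt
MASTER_PORT = 29500          # placeholder: the value passed to --master_port

try:
    # create_connection raises an OSError (e.g. a timeout, errno 110) if the port is unreachable
    with socket.create_connection((MASTER_ADDR, MASTER_PORT), timeout=5):
        print(f"OK: {MASTER_ADDR}:{MASTER_PORT} is reachable")
except OSError as exc:
    print(f"FAILED: cannot reach {MASTER_ADDR}:{MASTER_PORT} ({exc})")
    sys.exit(1)
```

If this also times out, the problem is likely at the network or firewall level rather than in DeepSpeed itself.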
Hi @Luoyang144, could you try running a very simple script across the nodes and let me know if that works? Something like:
```python
# example.py
print("Hello World!")
```
and run that with the same command: `deepspeed -H=host.txt --master_port=23 example.py`
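If the hello-world launch works, a slightly deeper sketch (assuming the `gloo` backend is acceptable for a connectivity check, so no GPU/NCCL setup is involved) that also exercises the `torch.distributed` rendezvous under the same launcher:

```python
# dist_test.py - launch with: deepspeed -H=host.txt --master_port=<port> dist_test.py
import torch
import torch.distributed as dist
import deepspeed

# The reported errno 110 typically surfaces during this rendezvous step
# if worker nodes cannot reach the master address/port.
deepspeed.init_distributed(dist_backend="gloo")

rank = dist.get_rank()
world_size = dist.get_world_size()

# A single all_reduce forces real cross-node communication.
t = torch.ones(1)
dist.all_reduce(t)
print(f"rank {rank}/{world_size}: all_reduce -> {t.item()} (expected {world_size})")
```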
Hi @mrwyattii, it works successfully (same code as your example).
I met the same issue. Have you found a solution? @Luoyang144
I have the same issue!
I have the same error