Training SegFormer-B1 on the Cityscapes dataset fails with the error "Default process group is not initialized"
znshi opened this issue 4 years ago · 8 comments
Hi, when I train SegFormer-B1 on Cityscapes:
Pretrained model path:
./SegFormer/pretrained/mit_b1.pth
Command:
cd SegFormer
python tools/train.py local_configs/segformer/B1/segformer.b1.1024x1024.city.160k.py
Error message:
Traceback (most recent call last):
File "tools/train.py", line 166, in
main()
File "tools/train.py", line 162, in main
meta=meta)
File "/home/shizenan/SegFormer/mmseg/apis/train.py", line 115, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py", line 67, in train_step
return self.module.train_step(*inputs[0], **kwargs[0])
File "/home/shizenan/SegFormer/mmseg/models/segmentors/base.py", line 152, in train_step
losses = self(**data_batch)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 84, in new_func
return old_func(*args, **kwargs)
File "/home/shizenan/SegFormer/mmseg/models/segmentors/base.py", line 122, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/home/shizenan/SegFormer/mmseg/models/segmentors/encoder_decoder.py", line 158, in forward_train
gt_semantic_seg)
File "/home/shizenan/SegFormer/mmseg/models/segmentors/encoder_decoder.py", line 102, in _decode_head_forward_train
self.train_cfg)
File "/home/shizenan/SegFormer/mmseg/models/decode_heads/decode_head.py", line 188, in forward_train
seg_logits = self.forward(inputs)
File "/home/shizenan/SegFormer/mmseg/models/decode_heads/segformer_head.py", line 82, in forward
_c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1))
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/cnn/bricks/conv_module.py", line 195, in forward
x = self.norm(x)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 519, in forward
world_size = torch.distributed.get_world_size(process_group)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 638, in get_world_size
return _get_group_size(group)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 220, in _get_group_size
_check_default_pg()
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 211, in _check_default_pg
"Default process group is not initialized"
AssertionError: Default process group is not initialized
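For context: launching tools/train.py directly never calls torch.distributed.init_process_group, while the SegFormer configs build their normalization layers from a norm_cfg of type 'SyncBN', and SyncBatchNorm asks torch.distributed for the world size in its forward pass, which is exactly the assertion above. The usual options are to launch through the distributed script even for a single GPU, or to switch the norm type to plain 'BN'. A minimal sketch of the relevant config fragment, assuming the stock mmseg-style layout where a module-level norm_cfg is passed to both the backbone and the decode head:

norm_cfg = dict(type='SyncBN', requires_grad=True)  # needs an initialized process group
# For a non-distributed single-GPU run (python tools/train.py), plain BatchNorm
# avoids the torch.distributed.get_world_size() call:
# norm_cfg = dict(type='BN', requires_grad=True)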
Thanks, but when I train SegFormer-B1 on Cityscapes as you suggested:
Command:
./tools/dist_train.sh local_configs/segformer/B1/segformer.b1.1024x1024.city.160k.py
Error message:
Traceback (most recent call last):
File "./tools/train.py", line 180, in
main()
File "./tools/train.py", line 105, in main
init_dist(args.launcher, **cfg.dist_params)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 20, in init_dist
_init_dist_pytorch(backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 34, in _init_dist_pytorch
dist.init_process_group(backend=backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 436, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/rendezvous.py", line 179, in _env_rendezvous_handler
store = TCPStore(master_addr, master_port, world_size, start_daemon, timeout)
RuntimeError: Address already in use
Traceback (most recent call last):
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in
main()
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/shizenan/anaconda3/envs/open-mmlab/bin/python', '-u', './tools/train.py', '--local_rank=1', 'local_configs/segformer/B1/segformer.b1.1024x1024.city.160k.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
(open-mmlab) shizenan@dell-PowerEdge-T640:~/SegFormer$ Traceback (most recent call last):
File "./tools/train.py", line 180, in
main()
File "./tools/train.py", line 105, in main
init_dist(args.launcher, **cfg.dist_params)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 20, in init_dist
_init_dist_pytorch(backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 34, in _init_dist_pytorch
dist.init_process_group(backend=backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 455, in init_process_group
barrier()
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 1960, in barrier
work = _default_pg.barrier()
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1607370156314/work/torch/lib/c10d/ProcessGroupNCCL.cpp:784, unhandled system error, NCCL version 2.7.8
Traceback (most recent call last):
File "./tools/train.py", line 180, in
main()
File "./tools/train.py", line 105, in main
init_dist(args.launcher, **cfg.dist_params)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 20, in init_dist
_init_dist_pytorch(backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 34, in _init_dist_pytorch
dist.init_process_group(backend=backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 436, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/rendezvous.py", line 179, in _env_rendezvous_handler
store = TCPStore(master_addr, master_port, world_size, start_daemon, timeout)
RuntimeError: Address already in use
Traceback (most recent call last):
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in
main()
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/shizenan/anaconda3/envs/open-mmlab/bin/python', '-u', './tools/train.py', '--local_rank=1', 'local_configs/segformer/B1/segformer.b1.1024x1024.city.160k.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
(open-mmlab) shizenan@dell-PowerEdge-T640:~/SegFormer$ Traceback (most recent call last):
File "./tools/train.py", line 180, in
main()
File "./tools/train.py", line 105, in main
init_dist(args.launcher, **cfg.dist_params)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 20, in init_dist
_init_dist_pytorch(backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/dist_utils.py", line 34, in _init_dist_pytorch
dist.init_process_group(backend=backend, **kwargs)
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 455, in init_process_group
barrier()
File "/home/shizenan/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 1960, in barrier
work = _default_pg.barrier()
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1607370156314/work/torch/lib/c10d/ProcessGroupNCCL.cpp:784, unhandled system error, NCCL version 2.7.8
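A note on the dist_train.sh failures above: "Address already in use" means the rendezvous TCP port is already taken, typically by a stale train.py process left over from an earlier run or by another job on the same machine, and once one rank dies the surviving ranks tend to fail with the NCCL "unhandled system error" as well. Assuming the script follows the stock mmsegmentation dist_train.sh, which reads a PORT environment variable (default 29500) and takes the number of GPUs as its second argument, picking a free port and making sure no leftover train.py processes are still running is usually enough, for example:

PORT=29501 ./tools/dist_train.sh local_configs/segformer/B1/segformer.b1.1024x1024.city.160k.py 1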
Hi, I have the same problem. I want to train the model with a single GPU because I have only one GPU, but it reported the same error after I changed 'SyncBN' in norm_cfg to 'BN'. Can you help me solve it? Thank you very much.
Traceback (most recent call last):
File "tools/train.py", line 167, in
main()
File "tools/train.py", line 163, in main
meta=meta)
File "/home/guzhengjie/Demo/SegFormer/mmseg/apis/train.py", line 115, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py", line 67, in train_step
return self.module.train_step(*inputs[0], **kwargs[0])
File "/home/guzhengjie/Demo/SegFormer/mmseg/models/segmentors/base.py", line 152, in train_step
losses = self(**data_batch)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 84, in new_func
return old_func(*args, **kwargs)
File "/home/guzhengjie/Demo/SegFormer/mmseg/models/segmentors/base.py", line 122, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/home/guzhengjie/Demo/SegFormer/mmseg/models/segmentors/encoder_decoder.py", line 153, in forward_train
x = self.extract_feat(img)
File "/home/guzhengjie/Demo/SegFormer/mmseg/models/segmentors/encoder_decoder.py", line 79, in extract_feat
x = self.backbone(img)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/guzhengjie/Demo/SegFormer/mmseg/models/backbones/resnet.py", line 635, in forward
x = self.stem(x)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 493, in forward
world_size = torch.distributed.get_world_size(process_group)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 620, in get_world_size
return _get_group_size(group)
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 219, in _get_group_size
_check_default_pg()
File "/home/guzhengjie/anaconda3/envs/segformer/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 210, in _check_default_pg
"Default process group is not initialized"
AssertionError: Default process group is not initialized
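Worth noting in this traceback: the failing SyncBatchNorm now sits inside mmseg/models/backbones/resnet.py (the backbone stem), so the model is still being built with SyncBN somewhere even after norm_cfg was edited, and the norm type has to be changed in every config file that defines it. A quick diagnostic sketch (a hypothetical helper, assuming the built model object is available, e.g. right after build_segmentor in tools/train.py):

import torch.nn as nn

def find_sync_bn(model):
    # Names of modules that are still SyncBatchNorm; a non-empty list means
    # some norm_cfg in the loaded configs still says 'SyncBN'.
    return [name for name, m in model.named_modules()
            if isinstance(m, nn.SyncBatchNorm)]

Printing find_sync_bn(model) before training starts should come back empty once all the config edits have taken effect.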
I also hit this issue. After I replaced 'SyncBN' with 'BN' in "segformer.b1.512x512.ade.160k.py", "segformer.py" and "segformer_head.py", the error disappeared.
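As an alternative to editing every config file by hand, the SyncBN layers can also be converted after the model is built; some newer mmcv releases ship a revert_sync_batchnorm utility for exactly this, and where that is not available, a manual sketch (a hypothetical helper, assuming 2D feature maps as in this model) could look like:

import torch.nn as nn

def revert_sync_bn(module):
    # Recursively replace nn.SyncBatchNorm with nn.BatchNorm2d, reusing the
    # learned affine parameters and running statistics.
    converted = module
    if isinstance(module, nn.SyncBatchNorm):
        converted = nn.BatchNorm2d(module.num_features, module.eps,
                                   module.momentum, module.affine,
                                   module.track_running_stats)
        if module.affine:
            converted.weight = module.weight
            converted.bias = module.bias
        converted.running_mean = module.running_mean
        converted.running_var = module.running_var
        converted.num_batches_tracked = module.num_batches_tracked
    for name, child in module.named_children():
        converted.add_module(name, revert_sync_bn(child))
    return converted

Calling model = revert_sync_bn(model) before the model is handed to train_segmentor would then let a non-distributed single-GPU run use ordinary BatchNorm statistics.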