I ran the command "python -m torch.distributed.launch --nproc_per_node=1 ./tools/train.py ./configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py" and got the following error:
Traceback (most recent call last):
File "./tools/train.py", line 168, in
main()
File "./tools/train.py", line 163, in main
logger=logger,
File "/home/duke/桌面/lidar/CenterPoint/det3d/torchie/apis/train.py", line 326, in train_detector
trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
File "/home/duke/桌面/lidar/CenterPoint/det3d/torchie/trainer/trainer.py", line 542, in run
epoch_runner(data_loaders[i], self.epoch, **kwargs)
File "/home/duke/桌面/lidar/CenterPoint/det3d/torchie/trainer/trainer.py", line 409, in train
self.model, data_batch, train_mode=True, **kwargs
File "/home/duke/桌面/lidar/CenterPoint/det3d/torchie/trainer/trainer.py", line 367, in batch_processor_inline
losses = model(example, return_loss=True)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/duke/桌面/lidar/CenterPoint/det3d/models/detectors/voxelnet.py", line 56, in forward
x, _ = self.extract_feat(example)
File "/home/duke/桌面/lidar/CenterPoint/det3d/models/detectors/voxelnet.py", line 47, in extract_feat
input_features, data["coors"], data["batch_size"], data["input_shape"]
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/duke/桌面/lidar/CenterPoint/det3d/models/backbones/scn.py", line 170, in forward
x = self.conv_input(ret)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/spconv/modules.py", line 123, in forward
input = module(input)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/spconv/conv.py", line 155, in forward
self.stride, self.padding, self.dilation, self.output_padding, self.subm, self.transposed, grid=input.grid)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/spconv/ops.py", line 89, in get_indice_pairs
stride, padding, dilation, out_padding, int(subm), int(transpose))
RuntimeError: /home/duke/桌面/lidar/spconv/src/spconv/indice.cu 125
cuda execution failed with error 2
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 27841) of binary: /home/duke/miniconda3/envs/deep-learning/bin/python
Traceback (most recent call last):
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in
main()
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/run.py", line 692, in run
)(*cmd_args)
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 116, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/duke/miniconda3/envs/deep-learning/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
./tools/train.py FAILED
=======================================
Root Cause:
[0]:
time: 2022-04-09_19:26:11
rank: 0 (local_rank: 0)
exitcode: 1 (pid: 27841)
error_file: <N/A>
msg: "Process failed with exitcode 1"
Other Failures:
<NO_OTHER_FAILURES>
From the solutions I found, this appears to be caused by running out of GPU memory (CUDA error 2 is an out-of-memory error). Can you tell me how to change the batch_size?
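To show what I mean, here is a minimal sketch of the kind of edit I expect is needed in the config file, assuming the per-GPU batch size is controlled by a samples_per_gpu field in the config's data dict (I am not sure this is the exact field name, so please correct me if it lives somewhere else):

# Hypothetical excerpt from nusc_centerpoint_voxelnet_0075voxel_dcn.py;
# the train/val/test dataset dicts are omitted and would stay as they are.
data = dict(
    samples_per_gpu=1,   # per-GPU batch size; lower this to reduce GPU memory use
    workers_per_gpu=4,   # dataloader worker count; does not affect GPU memory
)

After editing the config, I would rerun the same launch command as above. Is this the right place to change it?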