oneflow
oneflow copied to clipboard
ddp模式打开flow.cuda.synchronize(), oneflow会卡死
Summary
ddp模式打开flow.cuda.synchronize(), oneflow会卡死
Code to reproduce bug
flow-time_sync.py
"""
Usage:
$ python flow-time_sync.py
$ python -m oneflow.distributed.launch --nproc_per_node 2 flow-time_sync.py
"""
import random
import oneflow as flow
import oneflow.nn as nn
from oneflow.nn.parallel import DistributedDataParallel as DDP
import flowvision
import time
import os
# Distributed-launch environment, populated by `oneflow.distributed.launch`
# (same contract as torch's launcher: https://pytorch.org/docs/stable/elastic/run.html).
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # GPU index on this node; -1 when not launched distributed
RANK = int(os.getenv('RANK', -1))  # global process rank; -1 when not launched distributed
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))  # total number of launched processes; 1 for single-process runs
def is_parallel(model):
    """Return True when *model* is a DDP-wrapped module.

    Uses an exact ``type(...) in`` check (subclasses deliberately excluded),
    mirroring the original single-element tuple test.
    """
    wrapper_types = (nn.parallel.DistributedDataParallel,)
    return type(model) in wrapper_types
def de_parallel(model):
    """Unwrap a DDP-wrapped model; plain models pass through unchanged."""
    if is_parallel(model):
        return model.module
    return model
def smart_DDP(model):
    """Wrap *model* in oneflow's DistributedDataParallel and return the wrapper."""
    wrapped = DDP(model)
    return wrapped
def time_sync():
    """CUDA-accurate wall-clock time.

    Drains any pending GPU work before reading the host clock so the
    timestamp reflects completed device work, not just enqueued kernels.
    """
    cuda_present = flow.cuda.is_available()
    if cuda_present:
        # NOTE(review): this is the call the issue reports as hanging under DDP.
        flow.cuda.synchronize()
    return time.time()
def run():
    """Emulate a validation pass, accumulating three per-stage timings via time_sync()."""
    stage_seconds = [0.0, 0.0, 0.0]
    print("on_val_start")
    for _ in range(4):
        start = time_sync()
        after_first = time_sync()
        stage_seconds[0] += after_first - start
        stage_seconds[1] += time_sync() - after_first
        third_start = time_sync()
        stage_seconds[2] += time_sync() - third_start
def train(device):
    """Minimal training loop reproducing the hang: forward/backward on a
    resnet50, then — on rank 0 only — a timing pass that calls
    flow.cuda.synchronize() inside time_sync().
    """
    m = flowvision.models.resnet50().to(device)
    print("training..")
    # DDP mode: wrap the model only when launched via the distributed launcher.
    if RANK != -1:
        m = smart_DDP(m)
    for epoch in range(2):  # epoch ------------------------------------------------------------------
        if RANK in {-1, 0}:
            print("epoch:", epoch)
        # batch: one dummy forward/backward step per epoch.
        time.sleep(3)
        x = flow.zeros((1, 3, 224, 224)).to(device)
        y = m(x)
        y.sum().backward()
        if RANK in {-1, 0}:
            # Calculate mAP — only rank 0 enters run(); the other rank never
            # calls flow.cuda.synchronize(). NOTE(review): this asymmetry is
            # presumably where the reported DDP deadlock arises — TODO confirm.
            run()
        time.sleep(1)
        flow.cuda.empty_cache()
    return
def main():
    """Driver: pin this process to its GPU, run training, then shut down."""
    flow.cuda.set_device(LOCAL_RANK)
    cuda_device = flow.device('cuda', LOCAL_RANK)
    train(cuda_device)
    print('退出' * 50)
    is_rank0_of_many = WORLD_SIZE > 1 and RANK == 0
    if is_rank0_of_many:
        print('Destroying process group... ')
        exit(0)


if __name__ == "__main__":
    main()
启动指令
1: python flow-time_sync.py
控制台输出(点击展开)
loaded library: /lib/x86_64-linux-gnu/libibverbs.so.1
training..
epoch: 0
on_val_start
epoch: 1
on_val_start
退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出退出
2: python -m oneflow.distributed.launch --nproc_per_node 2 flow-time_sync.py
程序卡死
控制台输出(点击展开)
loaded library: /lib/x86_64-linux-gnu/libibverbs.so.1
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
loaded library: loaded library: /lib/x86_64-linux-gnu/libibverbs.so.1/lib/x86_64-linux-gnu/libibverbs.so.1
training..
training..
epoch: 0
on_val_start
System Information
- What is your OneFlow installation (pip, source, dockerhub):
- OS: oneflow25-root
- OneFlow version (run `python3 -m oneflow --doctor`): '0.8.1.dev20221027+cu112'
- Python version: 3.8
- CUDA driver version:
- GPU models:
- Other info: