dace
dace copied to clipboard
Scheduling and WCR Issues of 'Pure' reduce expansion on CUDA
Expanding a reduce node on a CUDA graph can lead to wrong behaviour, especially when dealing with reduce nodes nested inside another map.
- If the schedules of the maps produced by expanding the reduce nodes are not manually set to sequential, thread indices may appear inside the loop, leading to out-of-bounds accesses
- If the `wcr_conflict` attribute on the expanded reduce nodes' memlets is not manually set to False, other CUDA errors may occur (error 719 in the example below)
# Reproduction script for the CUDA 'pure' reduce-expansion issue.
import dace
import numpy as np
# Symbolic matrix dimensions used by both dace programs below.
N = dace.symbol('N')
M = dace.symbol('M')
# Bind concrete sizes for compilation/execution.
N.set(300); M.set(300)
@dace.program
def TEST(A: dace.float32[M, N]):
    """Row-wise max of A via a top-level reduce library node.

    Returns a vector of length M. This variant behaves correctly after
    apply_gpu_transformations() without any manual fixes.
    """
    # identity=0 is safe here because A is filled from np.random.rand
    # (values in [0, 1)) before execution.
    return dace.reduce(lambda a, b: max(a, b), A, axis=1, identity=0)
@dace.program
def TEST2(A: dace.float32[M, N]):
    """Row-wise max of A with the reduce node nested inside a map.

    Computes the same result as TEST, but each row's reduction runs
    inside a dace.map iteration. This is the variant that fails on CUDA
    unless the expanded reduce maps are fixed up manually (see __main__).
    """
    tmp_out = np.ndarray([M], dace.float32)
    for i in dace.map[0:M]:
        # Per-row 1-D reduction over A[i, :].
        tmp_out[i] = dace.reduce(lambda a, b: max(a, b), A[i, :], identity=0)
    return tmp_out
# Random single-precision input of shape (M, N); values lie in [0, 1).
A = np.random.rand(M.get(), N.get()).astype(np.float32)
if __name__ == '__main__':
    # Build SDFGs for both variants and move them to the GPU.
    sdfg1 = TEST.to_sdfg()
    sdfg2 = TEST2.to_sdfg()
    sdfg1.apply_gpu_transformations()
    sdfg2.apply_gpu_transformations()

    # sdfg2 does not work at this point — compiling and running it here
    # exhibits the bug:
    # return1 = sdfg1.compile()(A=A, N=N, M=M)
    # return2 = sdfg2.compile()(A=A, N=N, M=M)
    # print(np.linalg.norm(return1))
    # print(np.linalg.norm(return2))

    sdfg1.expand_library_nodes()
    sdfg2.expand_library_nodes()

    # Workaround that makes sdfg2 produce correct results: inside every
    # nested SDFG created by the expansion, disable write-conflict
    # resolution on all memlets and force the expanded maps to run
    # sequentially (so GPU thread indices do not leak into the loop).
    for sdfg in [sdfg2]:
        for node in sdfg.nodes()[0].nodes():
            if isinstance(node, dace.sdfg.nodes.NestedSDFG):
                for state in node.sdfg.nodes():
                    for snode in state.nodes():
                        for e in state.out_edges(snode):
                            e.data.wcr_conflict = False
                        if isinstance(snode, dace.sdfg.nodes.MapEntry):
                            snode.schedule = dace.dtypes.ScheduleType.Sequential

    # Both variants now compile and agree.
    return1 = sdfg1.compile()(A=A, N=N, M=M)
    return2 = sdfg2.compile()(A=A, N=N, M=M)
    print(np.linalg.norm(return1))
    print(np.linalg.norm(return2))
Reductions as library nodes are so delicate these days... Thanks!