dace
dace copied to clipboard
Various CUDA codegen errors
Two codegen errors on CUDA arch:
- Copying from shared memory to global memory fails to find a suitable constructor (see function bug1)
- Basic operators such as '-' and math-library functions such as 'exp' raise an overload error in FP16 mode (see function bug2)
import dace
import numpy as np
from copy import deepcopy as dcpy
import dace.dtypes as dtypes
import dace.libraries.standard as stdlib
# Shorthand aliases for the DaCe element types used by the two reproducers.
d32 = dace.float32
d16 = dace.float16
# Symbolic array extent shared by both programs; its value is set to 100 here.
N = dace.symbol('N')
N.set(100)
# Reproducer program used by bug1(): stores the sum of each row of the
# N x N float32 input X in a new length-N float32 array and returns it.
# NOTE(review): '#' comments are used instead of a docstring so that the
# statements handed to the DaCe frontend stay exactly as they were.
@dace.program
def TEST(X: d32[N,N]):
    result = np.ndarray([N], dtype = d32)
    # One map iteration per row of X.
    for i in dace.map[0:N]:
        # Sum-reduce row i, with 0 as the identity, into a temporary.
        tmp = dace.reduce(lambda a,b: a+b, X[i,:], identity = 0)
        # Presumably this assignment becomes the tasklet that bug1() bypasses
        # - TODO confirm against the generated SDFG.
        result[i] = tmp
    return result
# Reproducer program used by bug2(): replaces every element of the
# half-precision array X, in place, with exp(X[i] - 42).
# NOTE(review): `math` is not imported in the visible part of this file;
# confirm that the tasklet frontend resolves it without a Python import.
@dace.program
def TEST2(X: d16[N]):
    for i in dace.map[0:N]:
        with dace.tasklet:
            input << X[i]
            output >> X[i]
            # The literal has to be cast to half explicitly: with a bare 42
            # the subtraction mixes half and int operands, which triggers the
            # operator-overload error in FP16 mode.
            output = math.exp(input - dace.float16(42))
def bug1():
    """Reproduce the shared-memory to global-memory copy codegen error.

    Builds the SDFG of ``TEST`` and applies the GPU transformations. Every
    Reduce node in the first state is switched to its 'pure' implementation,
    and the container its first output edge writes to is given GPU shared
    storage. The last tasklet found in that state is then bypassed with a
    direct edge and removed, after which the SDFG is shown, compiled and
    called on random input.

    Raises:
        RuntimeError: if the first state contains no tasklet to bypass.
    """
    sdfg = TEST.to_sdfg()
    sdfg.apply_gpu_transformations()
    graph = sdfg.nodes()[0]

    # Collect the tasklet to remove while reconfiguring the Reduce nodes.
    # The graph itself is only modified after the iteration has finished.
    rm_node = None
    for node in graph.nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            rm_node = node
        if isinstance(node, stdlib.nodes.Reduce):
            node.implementation = 'pure'
            reduce_output = graph.out_edges(node)[0].dst
            sdfg.data(reduce_output.data).storage = dace.dtypes.StorageType.GPU_Shared
    if rm_node is None:
        raise RuntimeError('bug1: no tasklet found in the first SDFG state')

    # Connect the tasklet's producer straight to its consumer, reusing a copy
    # of the outgoing memlet, then drop the tasklet.
    in_edge = graph.in_edges(rm_node)[0]
    out_edge = graph.out_edges(rm_node)[0]
    graph.add_edge(u=in_edge.src, u_connector=None,
                   v=out_edge.dst, v_connector='IN_1',
                   memlet=dcpy(out_edge.data))
    graph.remove_node(rm_node)
    sdfg.view()

    A = np.random.rand(N.get(), N.get()).astype(np.float32)
    csdfg = sdfg.compile()
    # TEST declares its array parameter as X, so the input is bound to X.
    csdfg(X=A, N=N)
def bug2():
    """Reproduce the FP16 operator-overload codegen error.

    Builds the SDFG of ``TEST2``, applies the GPU transformations, compiles
    it and calls it on a random half-precision vector of length ``N``.
    """
    sdfg = TEST2.to_sdfg()
    sdfg.apply_gpu_transformations()

    A = np.random.rand(N.get()).astype(np.float16)
    csdfg = sdfg.compile()
    # TEST2 declares its array parameter as X, so the input is bound to X.
    csdfg(X=A, N=N)
# Script entry point: run the two reproducers in order, bug1 first.
if __name__ == '__main__':
    for reproducer in (bug1, bug2):
        reproducer()
Some of the half-precision issues have already been solved in #186 (not all though)
- fun1: This happens because the block size is set to max(1, N), since the reduce node operates on shared memory by default. I'm taking care of that.
- fun2: This is a bug in your tasklet: 42 has to be explicitly cast to a half:
output = math.exp(input - dace.float16(42))