
Various CUDA codegen errors

Open sancierra opened this issue 5 years ago • 2 comments

Two code generation errors on the CUDA target:

  • A copy from shared memory to global memory does not find a suitable constructor (see function bug1 below).
  • Basic operators such as '-' and math library functions such as 'exp' raise an overload error in FP16 mode (see function bug2 below).

Reproducer:

import math

import dace
import numpy as np
from copy import deepcopy as dcpy

import dace.dtypes as dtypes
import dace.libraries.standard as stdlib

d32 = dace.float32
d16 = dace.float16
N = dace.symbol('N')
N.set(100)

@dace.program
def TEST(X: d32[N, N]):
    # Sum-reduce each row of X into a vector of length N
    result = np.ndarray([N], dtype=d32)
    for i in dace.map[0:N]:
        tmp = dace.reduce(lambda a, b: a + b, X[i, :], identity=0)
        result[i] = tmp
    return result

@dace.program
def TEST2(X: d16[N]):
    # Elementwise FP16 computation: subtract a constant and apply exp in place
    for i in dace.map[0:N]:
        with dace.tasklet:
            input << X[i]
            output >> X[i]

            output = math.exp(input - 42)



def bug1():
    sdfg = TEST.to_sdfg()
    sdfg.apply_gpu_transformations()
    graph = sdfg.nodes()[0]

    # Select the 'pure' expansion for the reduce library node and place its
    # output in GPU shared memory; remember the copy-out tasklet for removal.
    for node in graph.nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            rm_node = node
        if isinstance(node, stdlib.nodes.Reduce):
            node.implementation = 'pure'
            sdfg.data(graph.out_edges(node)[0].dst.data).storage = dace.dtypes.StorageType.GPU_Shared

    # Bypass the copy-out tasklet so the reduce result is copied directly from
    # shared memory to the global output array, which triggers the codegen error.
    graph.add_edge(u=graph.in_edges(rm_node)[0].src, u_connector=None,
                   v=graph.out_edges(rm_node)[0].dst, v_connector='IN_1',
                   memlet=dcpy(graph.out_edges(rm_node)[0].data))
    graph.remove_node(rm_node)

    sdfg.view()
    A = np.random.rand(N.get(), N.get()).astype(np.float32)
    csdfg = sdfg.compile()
    csdfg(X=A, N=N)  # the data argument is named X in the program

def bug2():
    sdfg = TEST2.to_sdfg()
    sdfg.apply_gpu_transformations()
    A = np.random.rand(N.get()).astype(np.float16)
    # Compilation fails with an overload error for '-' / 'exp' on half precision
    csdfg = sdfg.compile()
    csdfg(X=A, N=N)  # the data argument is named X in the program

if __name__ == '__main__':
    bug1()
    bug2()

sancierra · Jul 02 '20 09:07

Some of the half-precision issues have already been solved in #186 (not all though)

tbennun · Jul 02 '20 19:07

  • bug1: This happens because the block size is set to max(1, N), since the reduce node operates on shared memory by default. I'm taking care of that.
  • bug2: This is a bug in your tasklet - 42 has to be explicitly cast to a half: output = math.exp(input - dace.float16(42)). See the corrected sketch below.
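
A minimal sketch of the corrected program with the explicit cast suggested above (TEST2_fixed is an illustrative name; N, d16, and the math import are taken from the reproducer):

@dace.program
def TEST2_fixed(X: d16[N]):
    for i in dace.map[0:N]:
        with dace.tasklet:
            input << X[i]
            output >> X[i]

            # Casting the constant to float16 keeps the subtraction and exp
            # in half precision, avoiding the mixed-type overload error
            output = math.exp(input - dace.float16(42))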

tbennun · Sep 30 '20 08:09