Halide icon indicating copy to clipboard operation
Halide copied to clipboard

Bug when I try to import my absurd SpMV code to the GPU

Open wraith1995 opened this issue 2 years ago • 1 comments

So this might not really be a bug because I think doing this is inadvisable, but that said I'd really like it to work and I think you should like the same because I think it is also pretty cool. Basically, I have a way to do CSR SpMV with Halide and it works fine on the CPU. However, when I try to gpu_tile, it gives me an all zeros answer.

import halide as hl
import scipy
import numpy as np
    

i = hl.Var("i")
block, thread = hl.Var("block"), hl.Var("thread")
@hl.generator(name="spmv")
class SpMV:
    maxPos = hl.InputScalar(hl.Int(32))

    pos = hl.InputBuffer(hl.Int(32), 1)
    idx = hl.InputBuffer(hl.Int(32), 1)
    val = hl.InputBuffer(hl.Float(32), 1)
    x = hl.InputBuffer(hl.Float(32), 1)
    y = hl.OutputBuffer(hl.Float(32), 1)


    def generate(g):
        r = hl.RDom([hl.Range(0, g.maxPos)])
        r.where(r.x < g.pos[i+1] - g.pos[i])
        g.y[i] = 0.0
        rowIdx = g.idx[hl.clamp(g.pos[i] + r.x, 0, g.idx.dim(0).max())] 
        posIdx = hl.clamp(g.pos[i] + r.x, 0, g.val.dim(0).max())
        #  note probably should boundary here instead of clamp.
        g.y[i] += g.x[hl.clamp(rowIdx, 0, g.x.dim(0).max())] * g.val[posIdx] 
        g.y.update(0).gpu_tile(i, block, thread, 16) # works without this.

m = 112
n =m
csr = scipy.sparse.random(m, n, density=0.01, format='csr', dtype=np.float32)

maxPos = 0
for x in range(csr.indptr.shape[0] - 1):
    maxPos = max(maxPos, csr.indptr[x + 1] - csr.indptr[x])        

xx = np.random.rand(m).astype(np.float32)
yy = np.zeros(m, dtype=np.float32)
yp = np.zeros(m, dtype=np.float32)
with hl.GeneratorContext(hl.Target("host-cuda")):
    gen = SpMV()
f = gen.compile_to_callable()
f(maxPos=maxPos, pos=csr.indptr, idx=csr.indices, val=csr.data, x=xx, y=yy)
yp = csr.dot(xx)
assert np.allclose(yp, yy)

wraith1995 avatar Nov 09 '23 21:11 wraith1995

However, when I try to gpu_tile, it gives me an all zeros answer.

You need to copy the resulting buffer back to CPU memory. I totally don't know how that works in the Python wrapper, but in C++ Halide, you need to result_buffer.copy_to_host().

mcourteaux avatar Dec 12 '23 11:12 mcourteaux