Halide
Halide copied to clipboard
Bug when I try to import my absurd SpMV code to the GPU
So this might not really be a bug, because I think doing this is inadvisable. That said, I'd really like it to work, and I think you should want that too, because it is also pretty cool. Basically, I have a way to do CSR SpMV with Halide, and it works fine on the CPU. However, when I try to gpu_tile, it gives me an all-zeros answer.
import halide as hl
import scipy
import numpy as np
# Pure loop variable used to index the output vector y inside the generator.
i = hl.Var("i")
# Variables handed to gpu_tile() when the update stage is scheduled.
block = hl.Var("block")
thread = hl.Var("thread")
@hl.generator(name="spmv")
class SpMV:
    """Halide generator for a CSR sparse matrix-vector product.

    For every output index ``i`` the generator sums
    ``x[idx[k]] * val[k]`` over the entries ``k`` in
    ``pos[i] .. pos[i + 1] - 1``.  The reduction domain has a fixed
    extent of ``maxPos`` and is trimmed per row with a ``where`` predicate.
    """

    # Extent of the reduction domain (upper bound on entries per row).
    maxPos = hl.InputScalar(hl.Int(32))
    # Per-row offsets into idx/val; row i spans pos[i] .. pos[i + 1].
    pos = hl.InputBuffer(hl.Int(32), 1)
    # Index into x for each stored entry.
    idx = hl.InputBuffer(hl.Int(32), 1)
    # Value of each stored entry.
    val = hl.InputBuffer(hl.Float(32), 1)
    # Dense input vector.
    x = hl.InputBuffer(hl.Float(32), 1)
    # Dense output vector.
    y = hl.OutputBuffer(hl.Float(32), 1)

    def generate(g):
        """Define the pure/update stages of ``y`` and schedule the update."""
        row_start = g.pos[i]
        row_length = g.pos[i + 1] - row_start

        # Iterate up to maxPos entries, but only those that belong to row i.
        k = hl.RDom([hl.Range(0, g.maxPos)])
        k.where(k.x < row_length)

        # Pure stage: start every output element at zero.
        g.y[i] = 0.0

        # Clamp every data-dependent index so accesses stay inside the
        # buffers.
        # NOTE: a boundary condition would probably be more appropriate than
        # clamp here.
        entry = row_start + k.x
        column = g.idx[hl.clamp(entry, 0, g.idx.dim(0).max())]
        value_at = hl.clamp(entry, 0, g.val.dim(0).max())

        # Update stage: accumulate x[column] * val[entry] into y[i].
        g.y[i] += g.x[hl.clamp(column, 0, g.x.dim(0).max())] * g.val[value_at]

        # Run the update stage on the GPU, 16 threads per block.
        # The result is correct when this line is left out.
        g.y.update(0).gpu_tile(i, block, thread, 16)
# Problem size: a square m x n sparse matrix.
m = 112
n = m
csr = scipy.sparse.random(m, n, density=0.01, format='csr', dtype=np.float32)

# Longest row of the matrix: consecutive indptr differences are the per-row
# entry counts.  `initial=0` keeps the result at 0 for a matrix with no rows.
maxPos = int(np.diff(csr.indptr).max(initial=0))

# x has one entry per column, y one entry per row.
xx = np.random.rand(n).astype(np.float32)
yy = np.zeros(m, dtype=np.float32)

with hl.GeneratorContext(hl.Target("host-cuda")):
    gen = SpMV()
    f = gen.compile_to_callable()

# The GPU-scheduled stage writes its result to device memory.  Pass the
# output as an explicit hl.Buffer so it can be copied back to the host
# afterwards; without the copy the host side still holds the initial zeros.
y_buf = hl.Buffer(yy)
f(maxPos=maxPos, pos=csr.indptr, idx=csr.indices, val=csr.data, x=xx, y=y_buf)
y_buf.copy_to_host()
# View the buffer's host memory as a NumPy array for the comparison.
yy = np.asarray(y_buf)

# Reference result computed by SciPy.
yp = csr.dot(xx)
assert np.allclose(yp, yy)
However, when I try to gpu_tile, it gives me an all zeros answer.
You need to copy the resulting buffer back to CPU memory. I have no idea how that works in the Python wrapper, but in C++ Halide you need to call result_buffer.copy_to_host().