Unsigned array bounds break in CUDA
TLDR: defining array sizes with unsigned integers and then tiling the resulting kernel will break the per-thread check to see if it is inside the loop domain. This is because the check is of the form if (N - something >= 0) {, which is always true if N is unsigned (at least in my testing)
Hefty mvp to reproduce (including PyCuda boilerplate)
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
domain = '{ [ic, jc] : 2 <= ic < nxc and 2 <= jc < nyc }'
body = '''
for jc, ic
qc[ic-1, jc-1] = (jc-1) + (ic-1)*nxc
end
'''
arguments_unsigned = [
lp.ValueArg('nxc', dtype=np.uint32),
lp.ValueArg('nyc', dtype=np.uint32),
lp.ArrayArg('qc', dtype=np.float64,
address_space=lp.AddressSpace.GLOBAL,
shape=('nxc', 'nyc'))
]
arguments_signed = [
lp.ValueArg('nxc', dtype=np.int32),
lp.ValueArg('nyc', dtype=np.int32),
lp.ArrayArg('qc', dtype=np.float64,
address_space=lp.AddressSpace.GLOBAL,
shape=('nxc', 'nyc'))
]
tile_size = 32
# Boilerplate code ...
def split_inames(kernel):
kernel = lp.split_iname(kernel, 'ic', tile_size, outer_tag="g.0", inner_tag="l.0")
kernel = lp.split_iname(kernel, 'jc', tile_size, outer_tag="g.1", inner_tag="l.1")
return kernel
def print_code(kernel):
kernel = kernel.copy(target=lp.CudaTarget())
cgr = lp.generate_code_v2(kernel)
print(cgr.device_code())
def run_cuda_kernel(kernel, signed, nxc=6, nyc=6):
# Grab device code from Loopy
kernel = kernel.copy(target=lp.CudaTarget())
cgr = lp.generate_code_v2(kernel)
dev_code = cgr.device_code()
print('device code:')
print(dev_code)
# Compile kernel w/ PyCuda
module = SourceModule(dev_code)
cuda_fn = module.get_function('loopy_kernel')
# Allocate out memory
qch = np.empty((nxc, nyc))
qc = cuda.mem_alloc(8 * nxc * nyc)
# Set launch arguments
grid = (1, 1, 1)
block = (tile_size, tile_size, 1)
if signed:
nxc = np.intc(nxc)
nyc = np.intc(nyc)
else:
nxc = np.uintc(nxc)
nyc = np.uintc(nyc)
cuda_fn(nxc, nyc, qc, grid=grid, block=block)
cuda.memcpy_dtoh(qch, qc)
print('output buffer:')
print(qch)
# Define kernels
print(' -- unsigned array limits -- ')
kernel_unsigned = split_inames(lp.make_kernel(
domains=domain, instructions=body, kernel_data=arguments_unsigned
))
run_cuda_kernel(kernel_unsigned, False)
print()
print()
print(' -- signed array limits -- ')
kernel_signed = split_inames(lp.make_kernel(
domains=domain, instructions=body, kernel_data=arguments_signed
))
run_cuda_kernel(kernel_signed, True)
For the unsigned case, I get the following device code:
extern "C" __global__ void __launch_bounds__(1024) loopy_kernel(unsigned const nxc, unsigned const nyc, double *__restrict__ qc)
{
if (-1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc >= 0 && -2 + 32 * bIdx(x) + tIdx(x) >= 0 && -2 + 32 * bIdx(y) + tIdx(y) >= 0 && -1 + -32 * bIdx(x) + -1 * tIdx(x) + nxc >= 0)
qc[nyc * (-1 + 32 * bIdx(x) + tIdx(x)) + -1 + 32 * bIdx(y) + tIdx(y)] = (tIdx(x) + bIdx(x) * 32.0 + -1.0) * (tIdx(y) + bIdx(y) * 32.0 + -1.0) * nxc;
}
which specifically breaks on the parts on the if statement. Expressions like -1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc>=0 are always true in unsigned arithmetic, so the above kernel will go past the correct array bounds.
Output I get with the unsigned kernel:
[[ 0. 0. 0. 0. 0. 0.]
[ 0. 7. 8. 9. 10. 11.]
[12. 13. 14. 15. 16. 17.]
[18. 19. 20. 21. 22. 23.]
[24. 25. 26. 27. 28. 29.]
[30. 31. 32. 33. 34. 35.]]
versus what I (correctly) get with a signed version, which is a ring of 0's around the exterior:
[[ 0. 0. 0. 0. 0. 0.]
[ 0. 7. 8. 9. 10. 0.]
[ 0. 13. 14. 15. 16. 0.]
[ 0. 19. 20. 21. 22. 0.]
[ 0. 25. 26. 27. 28. 0.]
[ 0. 0. 0. 0. 0. 0.]]
An easy fix is to have the code generator cast the limits to signed ... (signed) Nxc and (signed) Nyc) ... in the if statement. That seems to work on my end.
Index arithmetic in loopy is always assumed to be signed, but we're not doing a good job checking that. I agree that casting to signed in integer expressions is probably the cleanest way out of this.