uniform memory and `@synchronize` inside an if statement
Not really a bug in KA, but @vchuravy asked me to post
When uniform memory is used together with a `@synchronize` inside an if statement, one needs to be careful because of the way the implicit thread loops arise on the CPU.
using KernelAbstractions
using StaticArrays
using Test
# Good: no @synchronize in if statement
#
# Copies B into A one column per work-item, staging each column first in a
# scratch array declared in the @uniform block (l_B) and then in @localmem
# (s_B). The @synchronize sits at the top level of the kernel body, and the
# accompanying CPU test for this kernel passes.
#
# Arguments:
#   ::Val{N} - number of entries copied per work-item (first index of A and B)
#   A        - destination array, written at A[n, glo_id]
#   B        - source array, read at B[n, glo_id]
@kernel function no_if_uniform_copy!(::Val{N}, A, B) where {N}
@uniform begin
# Element type of the staging buffers follows the source array.
FT = eltype(B)
# Length-N mutable static array, left uninitialized.
l_B = MArray{Tuple{N}, FT}(undef)
# NOTE(review): the inner @uniform looks redundant inside a @uniform block — confirm.
grp_size = @uniform groupsize()[1]
end
##############################
# Start implicit thread loop?
glo_id = @index(Global)
loc_id = @index(Local)
# One column of N entries per local index.
s_B = @localmem FT (N, grp_size)
# store value of B in uniform memory
for n = 1:N
l_B[n] = B[n, glo_id]
end
# Dump value from uniform to shared
# (l_B is read back here with no @synchronize between the write and the read)
for n = 1:N
s_B[n, loc_id] = l_B[n]
end
# End implicit thread loop?
##############################
@synchronize
##############################
# Start implicit thread loop?
# Dump value from shared to global memory
for n = 1:N
A[n, glo_id] = s_B[n, loc_id]
end
# End implicit thread loop?
##############################
end
# Bad: @synchronize in if statement with uniform memory usage
#
# Performs the same copy as no_if_uniform_copy!, except that the l_B -> s_B
# transfer, the @synchronize and the final write to A are wrapped in `if true`.
# The accompanying CPU test for this kernel fails: the reported output shows
# neighbouring columns of A holding identical values instead of the matching
# columns of B.
#
# NOTE(review): presumably on the CPU the `if` closes the implicit work-item
# loop, so every work-item overwrites the shared l_B before any of them copies
# it into s_B — confirm against the KernelAbstractions CPU lowering.
#
# Arguments:
#   ::Val{N} - number of entries copied per work-item (first index of A and B)
#   A        - destination array, written at A[n, glo_id]
#   B        - source array, read at B[n, glo_id]
@kernel function with_if_uniform_copy!(::Val{N}, A, B) where {N}
@uniform begin
# Element type of the staging buffers follows the source array.
FT = eltype(B)
# Length-N mutable static array, left uninitialized.
l_B = MArray{Tuple{N}, FT}(undef)
# NOTE(review): the inner @uniform looks redundant inside a @uniform block — confirm.
grp_size = @uniform groupsize()[1]
end
##############################
# Start implicit thread loop?
glo_id = @index(Global)
loc_id = @index(Local)
# One column of N entries per local index.
s_B = @localmem FT (N, grp_size)
# store value of B in uniform memory
for n = 1:N
l_B[n] = B[n, glo_id]
end
# End implicit thread loop?
##############################
# Always-true branch: only the presence of the `if` around @synchronize matters here.
if true
##############################
# Start implicit thread loop
#
# Dump value from uniform to shared
# (l_B is read in a different region from the one that wrote it)
for n = 1:N
s_B[n, loc_id] = l_B[n]
end
# End implicit thread loop?
##############################
@synchronize
##############################
# Start implicit thread loop?
# Dump value from shared to global memory
for n = 1:N
A[n, glo_id] = s_B[n, loc_id]
end
# End implicit thread loop?
##############################
end
end
# Good: @synchronize in if statement with private memory usage
#
# Same control flow as with_if_uniform_copy! (the staging-to-shared transfer,
# the @synchronize and the write to A all sit inside `if true`), but the
# per-column staging buffer is declared with @private instead of inside the
# @uniform block. The accompanying CPU test for this kernel passes.
#
# Arguments:
#   ::Val{N} - number of entries copied per work-item (first index of A and B)
#   A        - destination array, written at A[n, glo_id]
#   B        - source array, read at B[n, glo_id]
@kernel function with_if_private_copy!(::Val{N}, A, B) where {N}
@uniform begin
# Element type of the staging buffers follows the source array.
FT = eltype(B)
# NOTE(review): the inner @uniform looks redundant inside a @uniform block — confirm.
grp_size = @uniform groupsize()[1]
end
# Length-N staging buffer in private memory (replaces the uniform l_B).
p_B = @private FT (N,)
##############################
# Start implicit thread loop?
glo_id = @index(Global)
loc_id = @index(Local)
# One column of N entries per local index.
s_B = @localmem FT (N, grp_size)
# store value of B in private memory
for n = 1:N
p_B[n] = B[n, glo_id]
end
# End implicit thread loop?
##############################
# Always-true branch: only the presence of the `if` around @synchronize matters here.
if true
##############################
# Start implicit thread loop?
# Dump value from private to shared
for n = 1:N
s_B[n, loc_id] = p_B[n]
end
# End implicit thread loop?
##############################
@synchronize
##############################
# Start implicit thread loop?
# Dump value from shared to global memory
for n = 1:N
A[n, glo_id] = s_B[n, loc_id]
end
# End implicit thread loop?
##############################
end
end
@testset "no if uniform copy" begin
    # Problem size: `nrows` entries per work-item, `ncols` work-items in total.
    nrows, ncols = 10, 1024
    B = rand(nrows, ncols)
    A = similar(B)
    # Instantiate the kernel for the CPU backend with a workgroup size of 8,
    # then launch it over all columns and block until it has finished.
    kernel! = no_if_uniform_copy!(CPU(), 8)
    event = kernel!(Val(nrows), A, B; ndrange = ncols)
    wait(event)
    # The kernel is a pure copy, so exact equality is the right check.
    @test A == B
end
@testset "with if private copy" begin
    # Problem size: `nrows` entries per work-item, `ncols` work-items in total.
    nrows, ncols = 10, 1024
    B = rand(nrows, ncols)
    A = similar(B)
    # Instantiate the kernel for the CPU backend with a workgroup size of 8,
    # then launch it over all columns and block until it has finished.
    kernel! = with_if_private_copy!(CPU(), 8)
    event = kernel!(Val(nrows), A, B; ndrange = ncols)
    wait(event)
    # The kernel is a pure copy, so exact equality is the right check.
    @test A == B
end
@testset "with if uniform copy" begin
    # Problem size: `nrows` entries per work-item, `ncols` work-items in total.
    nrows, ncols = 10, 1024
    B = rand(nrows, ncols)
    A = similar(B)
    # Instantiate the kernel for the CPU backend with a workgroup size of 8,
    # then launch it over all columns and block until it has finished.
    kernel! = with_if_uniform_copy!(CPU(), 8)
    event = kernel!(Val(nrows), A, B; ndrange = ncols)
    wait(event)
    # This is the reproduction of the reported problem: the check below is the
    # one that fails in the output attached to this report.
    @test A == B
end
output:
julia> include("buggy.jl")
Test Summary: | Pass Total
no if uniform copy | 1 1
Test Summary: | Pass Total
with if private copy | 1 1
with if uniform copy: Test Failed at /Users/jekozdon/scratch/2019_09_17/buggy.jl:167
Expression: A == B
Evaluated: [0.909792046331644 0.909792046331644 … 0.49137323913605546 0.49137323913605546; 0.14386949928286263 0.14386949928286263 … 0.7250834679876621 0.7250834679876621; … ; 0.9275922324520269 0.9275922324520269 … 0.5301867826757798 0.5301867826757798; 0.7105600705440542 0.7105600705440542 … 0.782530472812315 0.782530472812315] == [0.41175116387410693 0.1267238684429859 … 0.9288230234713291 0.49137323913605546; 0.9951976250072363 0.6354672711865443 … 0.0058710270867841086 0.7250834679876621; … ; 0.709797268000828 0.6061527988039019 … 0.188834315207701 0.5301867826757798; 0.9941190027847424 0.0318726131609639 … 0.037656338749129104 0.782530472812315]
Stacktrace:
[1] top-level scope at /Users/jekozdon/scratch/2019_09_17/buggy.jl:167
[2] top-level scope at /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.4/Test/src/Test.jl:1113
[3] top-level scope at /Users/jekozdon/scratch/2019_09_17/buggy.jl:161
Test Summary: | Fail Total
with if uniform copy | 1 1
ERROR: LoadError: Some tests did not pass: 0 passed, 1 failed, 0 errored, 0 broken.
in expression starting at /Users/jekozdon/scratch/2019_09_17/buggy.jl:160
I think I'm running into this too, but with localmem, and the error message that I'm getting looks more like what is reported in #262: `ERROR: LoadError: UndefVarError: x not defined in local scope`.
My if-statement is for filtering out invalid indices, so it seems needed to avoid accessing invalid memory. I'm wondering if re-closing #443 would help with this?
> My if-statement is for filtering out invalid indices,
Please take a look at https://juliagpu.github.io/KernelAbstractions.jl/stable/#0.9.34. If you are manually filtering out indices, `unsafe_indices=true` is likely what you want, but you will then be responsible for ensuring that `@synchronize` is executed by all threads in a warp.