KernelAbstractions.jl icon indicating copy to clipboard operation
KernelAbstractions.jl copied to clipboard

uniform memory and `@synchronize` inside an if statement

Open jkozdon opened this issue 5 years ago • 2 comments

Not really a bug in KA, but @vchuravy asked me to post

When uniform memory is used together with a `@synchronize` inside an `if` statement, one needs to be careful because of the way the implicit thread loops arise on the CPU:

using KernelAbstractions
using StaticArrays
using Test

# Good: no @synchronize in if statement (copies B to A through uniform and local scratch)
@kernel function no_if_uniform_copy!(::Val{N}, A, B) where {N}
  @uniform begin
    FT = eltype(B)
    l_B = MArray{Tuple{N}, FT}(undef)
    grp_size = @uniform groupsize()[1]
  end

  ##############################
  # Start implicit thread loop? (the loop markers are the author's guess for the CPU backend)

  glo_id = @index(Global)
  loc_id = @index(Local)
  s_B = @localmem FT (N, grp_size)

  # Store column glo_id of B in the uniform scratch l_B
  for n = 1:N
    l_B[n] = B[n, glo_id]
  end

  # Dump l_B into column loc_id of s_B; this happens before any @synchronize is reached
  for n = 1:N
    s_B[n, loc_id] = l_B[n]
  end

  # End implicit thread loop?
  ##############################
  @synchronize
  ##############################
  # Start implicit thread loop?

  # Dump value from the local-memory buffer s_B to global memory (column glo_id of A)
  for n = 1:N
    A[n, glo_id] = s_B[n, loc_id]
  end
  # End implicit thread loop?
  ##############################
end

# Bad: @synchronize in if statement with uniform memory usage (its test fails on CPU())
@kernel function with_if_uniform_copy!(::Val{N}, A, B) where {N}
  @uniform begin
    FT = eltype(B)
    l_B = MArray{Tuple{N}, FT}(undef)
    grp_size = @uniform groupsize()[1]
  end

  ##############################
  # Start implicit thread loop? (the loop markers are the author's guess for the CPU backend)

  glo_id = @index(Global)
  loc_id = @index(Local)
  s_B = @localmem FT (N, grp_size)

  # Store column glo_id of B in the uniform scratch l_B
  for n = 1:N
    l_B[n] = B[n, glo_id]
  end

  # End implicit thread loop?
  ##############################

  if true
    ##############################
    # Start implicit thread loop?
    # NOTE(review): presumably l_B no longer holds this work-item's values here -- confirm

    # Dump l_B into column loc_id of s_B; the reported output shows repeated columns in A
    for n = 1:N
      s_B[n, loc_id] = l_B[n]
    end

    # End implicit thread loop?
    ##############################
    @synchronize
    ##############################
    # Start implicit thread loop?

    # Dump value from the local-memory buffer s_B to global memory (column glo_id of A)
    for n = 1:N
      A[n, glo_id] = s_B[n, loc_id]
    end
    # End implicit thread loop?
    ##############################
  end
end

# Good: @synchronize in if statement with private memory usage (its test passes on CPU())
@kernel function with_if_private_copy!(::Val{N}, A, B) where {N}
  @uniform begin
    FT = eltype(B)
    grp_size = @uniform groupsize()[1]
  end

  p_B = @private FT (N,)

  ##############################
  # Start implicit thread loop? (the loop markers are the author's guess for the CPU backend)

  glo_id = @index(Global)
  loc_id = @index(Local)
  s_B = @localmem FT (N, grp_size)

  # Store column glo_id of B in the private buffer p_B
  for n = 1:N
    p_B[n] = B[n, glo_id]
  end

  # End implicit thread loop?
  ##############################

  if true
    ##############################
    # Start implicit thread loop?

    # Dump value from private to shared (column loc_id of s_B)
    for n = 1:N
      s_B[n, loc_id] = p_B[n]
    end

    # End implicit thread loop?
    ##############################
    @synchronize
    ##############################
    # Start implicit thread loop?

    # Dump value from the local-memory buffer s_B to global memory (column glo_id of A)
    for n = 1:N
      A[n, glo_id] = s_B[n, loc_id]
    end
    # End implicit thread loop?
    ##############################
  end
end

@testset "no if uniform copy" begin
  # Round-trip a random N x M matrix through the kernel; A must equal B exactly
  N, M = 10, 1024
  B = rand(N, M)
  A = similar(B)
  kernel! = no_if_uniform_copy!(CPU(), 8)
  wait(kernel!(Val(N), A, B; ndrange = M))
  @test A == B
end

@testset "with if private copy" begin
  # Same round-trip check, using the kernel that stages values in a @private buffer
  N, M = 10, 1024
  B = rand(N, M)
  A = similar(B)
  kernel! = with_if_private_copy!(CPU(), 8)
  wait(kernel!(Val(N), A, B; ndrange = M))
  @test A == B
end

@testset "with if uniform copy" begin
  # Same round-trip check for the uniform-memory kernel; this is the case reported as failing
  N, M = 10, 1024
  B = rand(N, M)
  A = similar(B)
  kernel! = with_if_uniform_copy!(CPU(), 8)
  wait(kernel!(Val(N), A, B; ndrange = M))
  @test A == B
end

output:

julia> include("buggy.jl")
Test Summary:      | Pass  Total
no if uniform copy |    1      1
Test Summary:        | Pass  Total
with if private copy |    1      1
with if uniform copy: Test Failed at /Users/jekozdon/scratch/2019_09_17/buggy.jl:167
  Expression: A == B
   Evaluated: [0.909792046331644 0.909792046331644 … 0.49137323913605546 0.49137323913605546; 0.14386949928286263 0.14386949928286263 … 0.7250834679876621 0.7250834679876621; … ; 0.9275922324520269 0.9275922324520269 … 0.5301867826757798 0.5301867826757798; 0.7105600705440542 0.7105600705440542 … 0.782530472812315 0.782530472812315] == [0.41175116387410693 0.1267238684429859 … 0.9288230234713291 0.49137323913605546; 0.9951976250072363 0.6354672711865443 … 0.0058710270867841086 0.7250834679876621; … ; 0.709797268000828 0.6061527988039019 … 0.188834315207701 0.5301867826757798; 0.9941190027847424 0.0318726131609639 … 0.037656338749129104 0.782530472812315]
Stacktrace:
 [1] top-level scope at /Users/jekozdon/scratch/2019_09_17/buggy.jl:167
 [2] top-level scope at /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.4/Test/src/Test.jl:1113
 [3] top-level scope at /Users/jekozdon/scratch/2019_09_17/buggy.jl:161
Test Summary:        | Fail  Total
with if uniform copy |    1      1
ERROR: LoadError: Some tests did not pass: 0 passed, 1 failed, 0 errored, 0 broken.
in expression starting at /Users/jekozdon/scratch/2019_09_17/buggy.jl:160

jkozdon avatar Sep 17 '20 15:09 jkozdon

I think I'm running into this too, but with localmem and the error message that I'm getting looks more like what is reported in #262: ERROR: LoadError: UndefVarError: x not defined in local scope.

My if-statement is for filtering out invalid indices, so it seems needed to avoid accessing invalid memory. I'm wondering if re-closing #443 would help with this?

charleskawczynski avatar May 13 '25 22:05 charleskawczynski

My if-statement is for filtering out invalid indices,

Please take a look at https://juliagpu.github.io/KernelAbstractions.jl/stable/#0.9.34. If you are manually filtering out indices, `unsafe_indices=true` is likely what you want, but you will be responsible for ensuring that synchronize is executed by all threads in a warp.

vchuravy avatar May 14 '25 04:05 vchuravy