
High-level API for norms and inner products

Open · ranocha opened this issue on Jun 07, 2018 · 0 comments

It would be very nice to have high-level methods for norm and dot, as discussed in https://github.com/JuliaGPU/GPUArrays.jl/issues/66 and https://github.com/JuliaGPU/GPUArrays.jl/issues/122.

It seems possible to get a working version of such high-level functions by adding the following to highlevel.jl:

## NRM2
import Base.LinAlg.BLAS: nrm2

for (func, elty) in [(:clblasSnrm2, Float32), (:clblasDnrm2, Float64),
                    (:clblasCnrm2, CL_float2), (:clblasZnrm2, CL_double2)]

    @eval function nrm2(n::Integer, x::CLArray{$elty}, incx::Integer;
                        queue=cl.queue(x))
        # need temporary buffers
        ctx = cl.context(x)
        norm2_buff = cl.Buffer($elty, ctx, :w, 1)
        scratch_buff = cl.Buffer($elty, ctx, :rw, 2*length(x))

        $func(Csize_t(n), pointer(norm2_buff), Csize_t(0), pointer(x), Csize_t(0), Cint(incx),
              pointer(scratch_buff), [queue])

        # read return value
        result = Vector{$elty}(1)
        cl.enqueue_read_buffer(queue, norm2_buff, result, Csize_t(0), nothing, true)
        @inbounds norm2 = result[1]

        return norm2
    end

end
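
On top of this, a high-level norm method is basically a one-liner. The following is only a sketch (assuming the nrm2 definition above is in scope); dot for the inner product should work the same way on top of the corresponding clblas*dot wrappers:

import Base: norm

# sketch: high-level norm for real element types, forwarding to the
# nrm2 method defined above (complex types would need a real return type)
norm(x::CLArray{T}) where {T <: Union{Float32, Float64}} = nrm2(length(x), x, 1)

For the CLArrays.CLArray type used in the benchmarks below, an analogous method would presumably go through GPUArrays.blasbuffer, as in the @time calls further down.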

However, this nrm2 implementation seems to be far from optimal: the corresponding clBLAS functions need a temporary buffer scratch_buff, which is allocated anew on each call (a possible workaround is sketched at the end of this issue). Here are some benchmarks using the implementation above (I did not open a PR since I think it is not good enough):

julia> using CuArrays, CLArrays, GPUArrays, BenchmarkTools

julia> v = rand(Float32, 100^3); dvu = CuArray(v); dvl = CLArray(v);

julia> @benchmark norm($v)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     225.994 μs (0.00% GC)
  median time:      226.209 μs (0.00% GC)
  mean time:        226.688 μs (0.00% GC)
  maximum time:     337.346 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark norm($dvu)
BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     55.020 μs (0.00% GC)
  median time:      64.425 μs (0.00% GC)
  mean time:        62.889 μs (0.00% GC)
  maximum time:     582.534 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark sqrt(mapreduce(abs2, +, dvl))
BenchmarkTools.Trial: 
  memory estimate:  13.03 KiB
  allocs estimate:  298
  --------------
  minimum time:     226.478 μs (0.00% GC)
  median time:      251.947 μs (0.00% GC)
  mean time:        261.367 μs (1.16% GC)
  maximum time:     17.595 ms (29.41% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001456 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.000734 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001154 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001699 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.000760 seconds (83 allocations: 2.438 KiB)
577.3568f0

I used @time for the last tests because I get the following error when running @benchmark on LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1):

julia> @benchmark LinAlg.BLAS.nrm2(length($dvl), GPUArrays.blasbuffer($dvl), 1)
ERROR: CLError(code=-4, CL_MEM_OBJECT_ALLOCATION_FAILURE)
Stacktrace:
 [1] #clblasSnrm2#119(::Array{Ptr{Void},1}, ::Function, ::UInt64, ::Ptr{Void}, ::UInt64, ::Ptr{Void}, ::UInt64, ::Int32, ::Ptr{Void}, ::Array{OpenCL.cl.CmdQueue,1}) at /home/.../.julia/v0.6/CLBLAS/src/macros.jl:132
 [2] #nrm2#451(::OpenCL.cl.CmdQueue, ::Function, ::Int64, ::OpenCL.cl.CLArray{Float32,1}, ::Int64) at /home/.../.julia/v0.6/CLBLAS/src/highlevel.jl:57
 [3] ##core#743(::CLArrays.CLArray{Float32,1}, ::CLArrays.CLArray{Float32,1}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:316
 [4] ##sample#744(::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:324
 [5] #_lineartrial#23(::Int64, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:92
 [6] _lineartrial(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:84
 [7] #lineartrial#20(::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:47
 [8] #tune!#26(::Bool, ::String, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:156
 [9] tune!(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:155

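The allocation failure is presumably caused by the scratch buffer: every nrm2 call allocates a fresh 2*length(x) buffer, and under @benchmark these do not seem to be freed fast enough, so device memory runs out. One possible direction, only a sketch with a hypothetical helper scratch_buffer and a global cache (not part of CLBLAS.jl), would be to reuse scratch buffers keyed by context, element type and size:

# hypothetical cache of clBLAS scratch buffers; not part of CLBLAS.jl.
# Keys are (context, element type, length). If cl.context(x) does not return
# the same object on every call, the underlying context handle should be used
# as the key instead. Thread safety and freeing buffers when a context is
# released are ignored in this sketch.
const SCRATCH_BUFFERS = Dict{Any, Any}()

function scratch_buffer(::Type{T}, ctx, n::Integer) where {T}
    get!(SCRATCH_BUFFERS, (ctx, T, n)) do
        cl.Buffer(T, ctx, :rw, n)
    end
end

The line scratch_buff = cl.Buffer($elty, ctx, :rw, 2*length(x)) in the nrm2 method above would then become scratch_buff = scratch_buffer($elty, ctx, 2*length(x)). Alternatively, scratch_buff could be exposed as a keyword argument so that callers (e.g. a future norm in GPUArrays) can manage the buffer themselves.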
ranocha · Jun 07 '18 15:06