oneAPI.jl icon indicating copy to clipboard operation
oneAPI.jl copied to clipboard

resize! broken on PVC

Open maleadt opened this issue 1 year ago • 1 comments

Error in testset array:
Test Failed at /home/sdp/.julia/dev/oneAPI/test/array.jl:83
  Expression: Array(a) == [1, 2, 3]
   Evaluated: [0, 0, 0] == [1, 2, 3]

Error in testset array:
Test Failed at /home/sdp/.julia/dev/oneAPI/test/array.jl:87
  Expression: (Array(a))[1:3] == [1, 2, 3]
   Evaluated: [0, 0, 0] == [1, 2, 3]

Error in testset array:
Test Failed at /home/sdp/.julia/dev/oneAPI/test/array.jl:91
  Expression: (Array(a))[1:2] == [1, 2]
   Evaluated: [0, 0] == [1, 2]

maleadt avatar May 15 '24 16:05 maleadt

That's too bad, any guess as to what's going on?

mtfishman avatar May 16 '24 18:05 mtfishman

Not really, this almost looks like a Julia CPU codegen issue. I currently have the following reduced MWE:

using oneAPI

# disable the GC to make sure this isn't a finalizer issue
GC.enable(false)

# global stuff
ctx = context()
dev = device()
queue = oneAPI.global_queue(ctx, dev)

# allocate and free a GPU array
gpu1 = oneArray{Float32}(undef, 1)
oneAPI.unsafe_free!(gpu1)

# allocate and resize another GPU array
gpu2 = oneArray{Float32}(undef, 1)
(function ()
    buf = oneAPI.allocate(oneL0.DeviceBuffer, ctx, dev, 4, 4)
    ptr = convert(oneL0.ZePtr{Float32}, buf)
    unsafe_copyto!(ctx, dev, ptr, pointer(gpu2), 1)
    new_data = oneAPI.DataRef(buf) do buf
        # do nothing
    end
    oneAPI.unsafe_free!(gpu2)

    gpu2.data = new_data
end)()
gpu2ptr = pointer(gpu2)

# copy to the CPU
cpu = Array{Float32}(undef, 1)
cpuptr = pointer(cpu)
list = oneL0.ZeCommandList(ctx, dev, queue.ordinal)
oneL0.append_copy!(list, cpuptr, gpu2ptr, 4)
close(list)
ccall(:jl_breakpoint, Cvoid, (Any,), list)

try
    oneL0.execute!(queue, [list])
    println("Success")
catch err
    Base.showerror(stdout, err)
    println()
finally
    # exit manually to avoid running finalizers, to simplify the API trace
    ccall(:exit, Cvoid, (Cint,), 0)
end

ERROR: LoadError: ZeError: unknown or internal error (code 2147483646, ZE_RESULT_ERROR_UNKNOWN)
Stacktrace:
 [1] throw_api_error(res::oneAPI.oneL0._ze_result_t)
   @ oneAPI.oneL0 ~/oneAPI/lib/level-zero/libze.jl:8
 [2] check
   @ ~/oneAPI/lib/level-zero/libze.jl:19 [inlined]
 [3] zeCommandQueueExecuteCommandLists
   @ ~/oneAPI/lib/utils/call.jl:24 [inlined]
 [4] execute!
   @ ~/oneAPI/lib/level-zero/cmdlist.jl:50 [inlined]
 [5] execute!(queue::oneAPI.oneL0.ZeCommandQueue, lists::Vector{oneAPI.oneL0.ZeCommandList})
   @ oneAPI.oneL0 ~/oneAPI/lib/level-zero/cmdlist.jl:50
 [6] top-level scope
   @ ~/oneAPI/wip.jl:37
in expression starting at /home/sdp/oneAPI/wip.jl:37

Changing minor things, like executing the resize function at top level, or running with --compile=min, results in success... Tested on 1.10 and 1.11.

maleadt avatar Aug 12 '24 11:08 maleadt

MWE version with raw API calls:

using oneAPI

drv = Ref{oneL0.ze_driver_handle_t}()
oneL0.zeDriverGet(Ref(UInt32(1)), drv)

ctx = Ref{oneL0.ze_context_handle_t}()
oneL0.zeContextCreate(drv[], Ref(oneL0.ze_context_desc_t()), ctx)

dev = Ref{oneL0.ze_device_handle_t}()
oneL0.zeDeviceGet(drv[], Ref(UInt32(1)), dev)

queue = Ref{oneL0.ze_command_queue_handle_t}()
oneL0.zeCommandQueueCreate(ctx[], dev[], Ref(oneL0.ze_command_queue_desc_t()), queue)

gpu1 = Ref{Ptr{Nothing}}()
oneL0.zeMemAllocDevice(ctx[], Ref(oneL0.ze_device_mem_alloc_desc_t()), 4, 4, dev[], gpu1)

oneL0.zeContextMakeMemoryResident(ctx[], dev[], gpu1[], 4)
oneL0.zeContextEvictMemory(ctx[], dev[], gpu1[], 4)
oneL0.zeMemFree(ctx[], gpu1[])

gpu2 = Ref{Ptr{Nothing}}()
oneL0.zeMemAllocDevice(ctx[], Ref(oneL0.ze_device_mem_alloc_desc_t()), 4, 4, dev[], gpu2)
oneL0.zeContextMakeMemoryResident(ctx[], dev[], gpu2[], 4)
(function ()
    buf = Ref{Ptr{Nothing}}()
    oneL0.zeMemAllocDevice(ctx[], Ref(oneL0.ze_device_mem_alloc_desc_t()), 4, 4, dev[], buf)
    oneL0.zeContextMakeMemoryResident(ctx[], dev[], buf[], 4)
    ptr = reinterpret(oneL0.ZePtr{Float32}, buf[])

    list = Ref{oneL0.ze_command_list_handle_t}()
    oneL0.zeCommandListCreate(ctx[], dev[], Ref(oneL0.ze_command_list_desc_t()), list)

    oneL0.zeCommandListAppendMemoryCopy(list[], ptr, gpu2[], 4, C_NULL, 0, C_NULL)
    oneL0.zeCommandListClose(list[])
    oneL0.zeCommandQueueExecuteCommandLists(queue[], 1, [list[]], C_NULL)
    oneL0.zeContextEvictMemory(ctx[], dev[], buf[], 4)
    oneL0.zeMemFree(ctx[], buf[])
end)()

for x in 1:10
    @show x

    list = Ref{oneL0.ze_command_list_handle_t}()
    oneL0.zeCommandListCreate(ctx[], dev[], Ref(oneL0.ze_command_list_desc_t()), list)

    oneL0.zeCommandListClose(list[])
    oneL0.zeCommandQueueExecuteCommandLists(queue[], 1, [list[]], C_NULL)
end

maleadt avatar Aug 12 '24 13:08 maleadt

Even more expanded:

using NEO_jll
using oneAPI_Level_Zero_Loader_jll: libze_loader

# API wrappers
const ze_result_t = UInt32
function zeInit(flags)
    @ccall libze_loader.zeInit(flags::UInt32)::ze_result_t
end
function zeDriverGet(pCount, phDrivers)
    @ccall libze_loader.zeDriverGet(pCount::Ptr{UInt32},
                                    phDrivers::Ptr{Ptr{Cvoid}})::ze_result_t
end
struct ze_context_desc_t
    stype::UInt32
    pNext::Ptr{Cvoid}
    flags::UInt32
end
function zeContextCreate(hDriver, desc, phContext)
    @ccall libze_loader.zeContextCreate(hDriver::Ptr{Cvoid},
                                        desc::Ptr{ze_context_desc_t},
                                        phContext::Ptr{Ptr{Cvoid}})::ze_result_t
end
function zeDeviceGet(hDriver, pCount, phDevices)
    @ccall libze_loader.zeDeviceGet(hDriver::Ptr{Cvoid}, pCount::Ptr{UInt32},
                                    phDevices::Ptr{Ptr{Cvoid}})::ze_result_t
end
struct ze_command_queue_desc_t
    stype::UInt32
    pNext::Ptr{Cvoid}
    ordinal::UInt32
    index::UInt32
    flags::UInt32
    mode::UInt32
    priority::UInt32
end
function zeCommandQueueCreate(hContext, hDevice, desc, phCommandQueue)
    @ccall libze_loader.zeCommandQueueCreate(hContext::Ptr{Cvoid},
                                             hDevice::Ptr{Cvoid},
                                             desc::Ptr{ze_command_queue_desc_t},
                                             phCommandQueue::Ptr{Ptr{Cvoid}})::ze_result_t
end
struct ze_device_mem_alloc_desc_t
    stype::UInt32
    pNext::Ptr{Cvoid}
    flags::UInt32
    ordinal::UInt32
end
function zeMemAllocDevice(hContext, device_desc, size, alignment, hDevice, pptr)
    @ccall libze_loader.zeMemAllocDevice(hContext::Ptr{Cvoid},
                                         device_desc::Ptr{ze_device_mem_alloc_desc_t},
                                         size::Csize_t, alignment::Csize_t,
                                         hDevice::Ptr{Cvoid},
                                         pptr::Ptr{Ptr{Cvoid}})::ze_result_t
end
struct ze_command_list_desc_t
    stype::UInt32
    pNext::Ptr{Cvoid}
    commandQueueGroupOrdinal::UInt32
    flags::UInt32
end
function zeCommandListCreate(hContext, hDevice, desc, phCommandList)
    @ccall libze_loader.zeCommandListCreate(hContext::Ptr{Cvoid},
                                            hDevice::Ptr{Cvoid},
                                            desc::Ptr{ze_command_list_desc_t},
                                            phCommandList::Ptr{Ptr{Cvoid}})::ze_result_t
end
function zeCommandListAppendMemoryCopy(hCommandList, dstptr, srcptr, size,
                                                hSignalEvent, numWaitEvents, phWaitEvents)
    @ccall libze_loader.zeCommandListAppendMemoryCopy(hCommandList::Ptr{Cvoid},
                                                      dstptr::Ptr{Cvoid},
                                                      srcptr::Ptr{Cvoid},
                                                      size::Csize_t,
                                                      hSignalEvent::Ptr{Cvoid},
                                                      numWaitEvents::UInt32,
                                                      phWaitEvents::Ptr{Ptr{Cvoid}})::ze_result_t
end
function zeCommandListClose(hCommandList)
    @ccall libze_loader.zeCommandListClose(hCommandList::Ptr{Cvoid})::ze_result_t
end
function zeCommandQueueExecuteCommandLists(hCommandQueue, numCommandLists,
                                                    phCommandLists, hFence)
    @ccall libze_loader.zeCommandQueueExecuteCommandLists(hCommandQueue::Ptr{Cvoid},
                                                          numCommandLists::UInt32,
                                                          phCommandLists::Ptr{Ptr{Cvoid}},
                                                          hFence::Ptr{Cvoid})::ze_result_t
end
function zeContextEvictMemory(hContext, hDevice, ptr, size)
    @ccall libze_loader.zeContextEvictMemory(hContext::Ptr{Cvoid},
                                             hDevice::Ptr{Cvoid},
                                             ptr::Ptr{Cvoid},
                                             size::Csize_t)::ze_result_t
end

# set-up API
zeInit(0)
const drv = Ref{Ptr{Cvoid}}()
zeDriverGet(Ref(UInt32(1)), drv)
const ctx = Ref{Ptr{Cvoid}}()
zeContextCreate(drv[], Ref(ze_context_desc_t(0, C_NULL, 0)), ctx)
const dev = Ref{Ptr{Cvoid}}()
zeDeviceGet(drv[], Ref(UInt32(1)), dev)
const queue = Ref{Ptr{Cvoid}}()
zeCommandQueueCreate(ctx[], dev[], Ref(ze_command_queue_desc_t(0, C_NULL, 0, 0, 0, 0, 0)), queue)

# allocate first buffer
const buf1 = Ref{Ptr{Nothing}}()
zeMemAllocDevice(ctx[], Ref(ze_device_mem_alloc_desc_t(0, C_NULL, 0, 0)), 4, 4, dev[], buf1)
const ptr1 = reinterpret(Ptr{Float32}, buf1[])

# allocate second buffer
const buf2 = Ref{Ptr{Nothing}}()
zeMemAllocDevice(ctx[], Ref(ze_device_mem_alloc_desc_t(0, C_NULL, 0, 0)), 4, 4, dev[], buf2)
const ptr2 = reinterpret(Ptr{Float32}, buf2[])

# copy from first to second buffer
const list1 = Ref{Ptr{Cvoid}}()
zeCommandListCreate(ctx[], dev[], Ref(ze_command_list_desc_t(0, C_NULL, 0, 0)), list1)
zeCommandListAppendMemoryCopy(list1[], ptr2, ptr1, 4, C_NULL, 0, C_NULL)
zeCommandListClose(list1[])
# XXX: without this function, or with --compile-min, or when moving anything
#      out of the function, the final API call works again
function f()
    zeCommandQueueExecuteCommandLists(queue[], 1, [list1[]], C_NULL)
    # XXX: adding a print statement here hides the issue
    #println(42)
    zeContextEvictMemory(ctx[], dev[], buf2[], 4)
end
using InteractiveUtils
# @code_llvm f()
@code_native f()
f()

# perform other operations
const list2 = Ref{Ptr{Cvoid}}()
zeCommandListCreate(ctx[], dev[], Ref(ze_command_list_desc_t(0, C_NULL, 0, 0)), list2)
zeCommandListClose(list2[])
@assert zeCommandQueueExecuteCommandLists(queue[], 1, [list2[]], C_NULL) == 0

I'm out of ideas here. The generated assembly looks identical, so I'm very confused why this fails on PVC...

maleadt avatar Aug 12 '24 15:08 maleadt

Debugging this is very tricky, changing even the smallest things (e.g. setting specific breakpoints in GDB) cause the reproducer to fail, even though the generated code looks identical.

One possible explanation for that, is a timing issue. And wrt. that, something seems off: is it safe to evict memory right after a call to zeCommandQueueExecuteCommandLists? The docs mention [t]he application must ensure the device is not currently referencing the memory before it is evicted, so maybe that means we need to synchronize the command queue before evicting that memory? That would be annoying, because there isn't a non-blocking version of zeContextEvictMemory (like there is for zeMemFree by passing ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE), which would result in stalls when the Julia GC kicks in.

@pengtu @kballeda Could you chime in on my reasoning here? Is it even necessary to call zeContextEvictMemory if anything we're doing is a zeMemFree afterwards?

EDIT: Disabling memory eviction only fixes my reduced example, not the original one...

maleadt avatar Aug 12 '24 17:08 maleadt

EDIT: Disabling memory eviction only fixes my reduced example, not the original one...

Actually, I think this does fix the issue. However, the resize tests being the last one in the array test set, I think they happen to trap another error from the preceding broadcast testset. The PVC userspace driver helpfully throws an UNKNOWN_ERROR when a previous command buffer failed to submit (which only happens late because of it calling flush when the next command buffer is processed). This is extremely painful to debug...

maleadt avatar Aug 12 '24 18:08 maleadt

Debugging this is very tricky, changing even the smallest things (e.g. setting specific breakpoints in GDB) cause the reproducer to fail, even though the generated code looks identical.

One possible explanation for that, is a timing issue. And wrt. that, something seems off: is it safe to evict memory right after a call to zeCommandQueueExecuteCommandLists? The docs mention [t]he application must ensure the device is not currently referencing the memory before it is evicted, so maybe that means we need to synchronize the command queue before evicting that memory? That would be annoying, because there isn't a non-blocking version of zeContextEvictMemory (like there is for zeMemFree by passing ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE), which would result in stalls when the Julia GC kicks in.

@pengtu @kballeda Could you chime in on my reasoning here? Is it even necessary to call zeContextEvictMemory if anything we're doing is a zeMemFree afterwards?

EDIT: Disabling memory eviction only fixes my reduced example, not the original one...

Level zero doesn't track the usage of the memory it is better to synchronize and evict.

kballeda avatar Aug 23 '24 04:08 kballeda

it is better to synchronize and evict

That will kill performance.

What specifically is the effect of calling zeContextEvictMemory right before zeMemFree?

maleadt avatar Aug 23 '24 05:08 maleadt

it is better to synchronize and evict

That will kill performance.

What specifically is the effect of calling zeContextEvictMemory right before zeMemFree?

Just calling zeMemFree is sufficient and you don't need to call zeContextEvictMemory

kballeda avatar Aug 30 '24 04:08 kballeda

Great, thanks for confirming.

maleadt avatar Aug 30 '24 13:08 maleadt