GPUCompiler.jl
Native execution: sret broken on M1
LazyCodegen: Test Failed at /Users/tim/Julia/pkg/GPUCompiler/test/native.jl:341
Expression: call_delayed(complex, 1.0, 2.0) == 1.0 + 2.0im
Evaluated: 5.0e-324 + 5.3532846314e-314im == 1.0 + 2.0im
The wrong result occurs even though the LLVM IR generated for `call_delayed` looks identical to the IR for a direct call to `complex`:
julia> native_code_llvm(call_delayed, Tuple{typeof(complex), Float64, Float64})
; @ /Users/tim/Julia/pkg/GPUCompiler/test/definitions/native.jl:308 within `call_delayed`
; Function Attrs: alwaysinline
define void @julia_call_delayed_1843([2 x double]* noalias nocapture sret([2 x double]) %0, double %1, double %2) local_unnamed_addr #0 {
top:
; @ /Users/tim/Julia/pkg/GPUCompiler/test/definitions/native.jl:312 within `call_delayed`
; ┌ @ /Users/tim/Julia/pkg/GPUCompiler/test/definitions/native.jl:246 within `abi_call`
; │┌ @ /Users/tim/Julia/pkg/GPUCompiler/test/definitions/native.jl:303 within `macro expansion`
; ││┌ @ refvalue.jl:39 within `unsafe_convert`
; │││┌ @ array.jl:157 within `allocatedinline`
%3 = call i32 inttoptr (i64 4339333660 to i32 ({}*)*)({}* nonnull inttoptr (i64 5153111696 to {}*))
; └└└└
%.sroa.0.0..sroa_idx = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 0
store double %1, double* %.sroa.0.0..sroa_idx, align 8
%.sroa.2.0..sroa_idx2 = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 1
store double %2, double* %.sroa.2.0..sroa_idx2, align 8
ret void
}
julia> code_llvm(complex, Tuple{Float64, Float64})
; @ complex.jl:171 within `complex`
define void @julia_complex_2051([2 x double]* noalias nocapture sret([2 x double]) %0, double %1, double %2) #0 {
top:
%.sroa.0.0..sroa_idx = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 0
store double %1, double* %.sroa.0.0..sroa_idx, align 8
%.sroa.2.0..sroa_idx1 = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 1
store double %2, double* %.sroa.2.0..sroa_idx1, align 8
ret void
}
The same test passes when run under Rosetta 2 on the same system.