Halide
Halide copied to clipboard
Strange cuInit initialization error with generated halide code
Hello. I am getting a weird initialization error when running a simple program that uses the Halide CUDA backend.
Running ./build/clang/csrc/vec_add_gpu
Run on (32 X 5480.74 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x16)
L1 Instruction 32 KiB (x16)
L2 Unified 1024 KiB (x16)
L3 Unified 32768 KiB (x2)
Load Average: 0.09, 0.14, 0.50
Error: CUDA error: <Unknown error> cuInit failed
Aborted (core dumped)
The generator code:
#include "Halide.h"
using namespace Halide;
/// Halide generator that adds two one-dimensional float buffers element by element.
class VectorAddGenerator : public Halide::Generator<VectorAddGenerator>
{
public:
    // Operands and result are all declared as 1-D float buffers.
    Input<Buffer<float>> input1{"input1", 1};
    Input<Buffer<float>> input2{"input2", 1};
    Output<Buffer<float>> output{"output", 1};

    /// Defines output(x) = input1(x) + input2(x) and schedules it for the current target.
    void generate()
    {
        Var x("x");
        Var xi("xi");
        Var xo("xo");

        output(x) = input1(x) + input2(x);

        const bool targets_gpu = get_target().has_gpu_feature();
        if (!targets_gpu)
        {
            // CPU path: split x by 256, run the outer loop in parallel and
            // vectorize the inner loop at the natural float vector width.
            output.split(x, xo, xi, 256)
                .parallel(xo)
                .vectorize(xi, natural_vector_size<float>());
        }
        else
        {
            // GPU path: tile x into blocks of 256.
            output.gpu_tile(x, xo, xi, 256);
        }

        // Print the resulting loop nest for inspection while generating.
        output.print_loop_nest();
    }
};
HALIDE_REGISTER_GENERATOR(VectorAddGenerator, vector_add)
Client code:
#include <iostream>
#include <cassert>
#include "HalideBuffer.h"
#include "HalideRuntimeCuda.h"
#include "vector_add.h"
#include <benchmark/benchmark.h>
#define N 32768
class vectorAddFixture : public benchmark::Fixture
{
public:
vectorAddFixture() : input1(N), input2(N), output(N) {}
void SetUp(const benchmark::State &state)
{
input1.fill(1.0f);
input2.fill(2.0f);
input1.set_host_dirty();
input2.set_host_dirty();
}
void TearDown(const benchmark::State &state)
{
input1.device_deallocate();
input2.device_deallocate();
output.device_deallocate();
}
Halide::Runtime::Buffer<float> input1, input2, output;
};
// Times one vector_add invocation followed by a device sync per iteration.
// Both calls return an integer status code; a non-zero value is reported
// through the benchmark state instead of being silently ignored.
BENCHMARK_F(vectorAddFixture, vectorAdd)
(benchmark::State &state)
{
    for (auto _ : state)
    {
        if (vector_add(input1, input2, output) != 0)
        {
            state.SkipWithError("vector_add returned a non-zero error code");
            break; // the range-for loop must be exited explicitly after SkipWithError
        }
        if (output.device_sync() != 0)
        {
            state.SkipWithError("output.device_sync() returned a non-zero error code");
            break;
        }
    }
}
BENCHMARK_MAIN();
Compiling with CMake using
# Generator target built from hlgen_vector_add.cpp.
add_halide_generator(hlgen_vector_add SOURCES hlgen_vector_add.cpp)
# Halide library target `vector_add`, produced from the generator above.
# Requests the STMT and STMT_HTML extra outputs and adds the large_buffers,
# cuda and cuda_capability_75 target features.
add_halide_library(vector_add FROM hlgen_vector_add
STMT_HTML ON
STMT ON
FEATURES large_buffers cuda cuda_capability_75)
# Benchmark executable built from the client source.
add_executable(vec_add_gpu vec_add_gpu.cpp)
# NOTE(review): the client source already expands BENCHMARK_MAIN(); linking
# benchmark::benchmark_main as well looks redundant — confirm.
target_link_libraries(vec_add_gpu PRIVATE
Halide::Tools
vector_add
benchmark::benchmark
benchmark::benchmark_main)
The strange part is that the code executes without any issue in some environments. So far I have tested it both inside a container (where it sometimes works and sometimes does not) and directly on the host OS. I have also tested on several GPUs (V100, RTX 2080, A100); currently it only works on the RTX 2080.
Dockerfile that I am using to test on the RTX2080 (Fedora 41)
# Base image: NVIDIA CUDA 12.6.3 devel on Ubuntu 24.04.
FROM docker.io/nvidia/cuda:12.6.3-devel-ubuntu24.04
# Build tooling (ninja, cmake, git, compilers, pip); the apt lists are removed afterwards.
RUN apt-get update && apt-get install -y \
ninja-build \
cmake \
git \
g++ \
gcc \
python3-pip \
&& rm -rf /var/lib/apt/lists/*
# Build LLVM 18.1.8 from source (clang and lld projects, compiler-rt runtime,
# assertions/EH/RTTI enabled), install it to /usr/local, then delete the
# checkout and the build tree.
RUN git clone --depth 1 --branch llvmorg-18.1.8 https://github.com/llvm/llvm-project.git && \
cmake -G Ninja -S llvm-project/llvm -B build \
-DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang;lld" \
-DLLVM_ENABLE_RUNTIMES=compiler-rt \
-DLLVM_TARGETS_TO_BUILD="WebAssembly;X86;AArch64;ARM;Hexagon;NVPTX;PowerPC;RISCV" \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON \
-DLLVM_ENABLE_RTTI=ON \
-DLLVM_ENABLE_HTTPLIB=OFF \
-DLLVM_ENABLE_LIBEDIT=OFF \
-DLLVM_ENABLE_LIBXML2=OFF \
-DLLVM_ENABLE_TERMINFO=OFF \
-DLLVM_ENABLE_ZLIB=OFF \
-DLLVM_ENABLE_ZSTD=OFF \
-DLLVM_BUILD_32_BITS=OFF \
-DCMAKE_INSTALL_PREFIX=/usr/local && \
cmake --build build && \
cmake --install build --prefix=/usr/local && \
rm -rf llvm-project build
# Image libraries plus packaging, profiling and debugging tools
# (nsight-systems, gdb); the apt lists are removed afterwards.
RUN apt-get update && apt-get install -y \
libjpeg-dev \
libpng-dev \
curl \
zip \
unzip \
pkg-config \
nsight-systems \
gdb \
tar && \
rm -rf /var/lib/apt/lists/*
# Build Halide v18.0.0 (Release) and install it to /usr/local/halide.
# Only the build directory is removed; the Halide checkout stays in the image.
RUN git clone --depth 1 --branch v18.0.0 https://github.com/halide/Halide.git && \
cmake -G Ninja -S Halide -B build -DCMAKE_BUILD_TYPE=Release && \
cmake --build build && \
cmake --install build --prefix=/usr/local/halide && \
rm -rf build
# Directories used as vcpkg's download and binary cache locations.
RUN mkdir -p /opt/vcpkg-downloads /opt/vcpkg-binary-cache
# Point vcpkg at those directories, put /opt/vcpkg on PATH and set
# VCPKG_FORCE_SYSTEM_BINARIES.
ENV VCPKG_DEFAULT_BINARY_CACHE="/opt/vcpkg-binary-cache" \
VCPKG_DOWNLOADS="/opt/vcpkg-downloads" \
PATH="/opt/vcpkg:$PATH" \
VCPKG_FORCE_SYSTEM_BINARIES="1"
# Clone vcpkg at tag 2024.11.16 into /opt/vcpkg and bootstrap it with metrics disabled.
RUN cd /opt && \
git clone --recursive --depth 1 --branch 2024.11.16 https://github.com/microsoft/vcpkg.git && \
cd vcpkg && \
./bootstrap-vcpkg.sh -disableMetrics
thanks
Is this still an issue? Have you tried running this in gdb? What's the error value?