Strange cuInit initialization error with generated halide code

Open aavbsouza opened this issue 1 year ago • 1 comments

Hello. I am getting a weird initialization error when running a simple program that uses the Halide CUDA backend.

Running ./build/clang/csrc/vec_add_gpu
Run on (32 X 5480.74 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x16)
  L1 Instruction 32 KiB (x16)
  L2 Unified 1024 KiB (x16)
  L3 Unified 32768 KiB (x2)
Load Average: 0.09, 0.14, 0.50
Error: CUDA error: <Unknown error> cuInit failed
Aborted (core dumped)

The generator of the code

#include "Halide.h"

using namespace Halide;

// Halide generator that adds two one-dimensional float buffers element-wise.
class VectorAddGenerator : public Halide::Generator<VectorAddGenerator>
{
public:
    // Operands and result; each is declared as a 1-D float buffer.
    Input<Buffer<float>> input1{"input1", 1};
    Input<Buffer<float>> input2{"input2", 1};
    Output<Buffer<float>> output{"output", 1};

    // Defines the algorithm, then picks a schedule depending on whether the
    // build target has a GPU feature enabled.
    void generate()
    {
        Var x("x");
        Var xo("xo");
        Var xi("xi");

        // Algorithm: output[x] = input1[x] + input2[x].
        output(x) = input1(x) + input2(x);

        const bool target_has_gpu = get_target().has_gpu_feature();
        if (!target_has_gpu)
        {
            // CPU schedule: split x by 256 into xo/xi, run xo in parallel and
            // vectorize xi by the natural float vector width of the target.
            output.split(x, xo, xi, 256)
                .parallel(xo)
                .vectorize(xi, natural_vector_size<float>());
        }
        else
        {
            // GPU schedule: tile x by 256 into xo/xi via gpu_tile.
            output.gpu_tile(x, xo, xi, 256);
        }

        // Print the loop nest of the chosen schedule for inspection.
        output.print_loop_nest();
    }
};

HALIDE_REGISTER_GENERATOR(VectorAddGenerator, vector_add)

client code

#include <iostream>
#include <cassert>

#include "HalideBuffer.h"
#include "HalideRuntimeCuda.h"
#include "vector_add.h"

#include <benchmark/benchmark.h>

#define N 32768

class vectorAddFixture : public benchmark::Fixture
{

public:
    vectorAddFixture() : input1(N), input2(N), output(N) {}
    void SetUp(const benchmark::State &state)
    {
        input1.fill(1.0f);
        input2.fill(2.0f);

        input1.set_host_dirty();
        input2.set_host_dirty();
    }

    void TearDown(const benchmark::State &state)
    {
        input1.device_deallocate();
        input2.device_deallocate();
        output.device_deallocate();
        
    }
    Halide::Runtime::Buffer<float> input1, input2, output;
};

// Times one vector_add invocation followed by a device sync on the output.
// Both calls return an int status (0 on success); a non-zero status is
// reported through SkipWithError instead of being silently timed.
BENCHMARK_F(vectorAddFixture, vectorAdd)
(benchmark::State &state)
{
    for (auto _ : state)
    {
        if (vector_add(input1, input2, output) != 0)
        {
            state.SkipWithError("vector_add failed");
            break; // required after SkipWithError inside a range-for loop
        }
        if (output.device_sync() != 0)
        {
            state.SkipWithError("output.device_sync failed");
            break;
        }
    }
}

BENCHMARK_MAIN();

Compiling with CMake using

# Build the generator executable from its single source file.
add_halide_generator(hlgen_vector_add SOURCES hlgen_vector_add.cpp)

# Ahead-of-time compile the pipeline with the CUDA backend enabled.
# cuda_capability_70 is used as the lowest common denominator of the GPUs this
# is tested on (V100 = 7.0, RTX2080 = 7.5, A100 = 8.0): code generated for
# capability 7.5 cannot be loaded on a 7.0 device, while 7.0 code can run on
# all three.
add_halide_library(vector_add FROM hlgen_vector_add
                    STMT_HTML ON
                    STMT ON
                    FEATURES large_buffers cuda cuda_capability_70)

# Benchmark executable that links the generated pipeline and Google Benchmark.
add_executable(vec_add_gpu vec_add_gpu.cpp)
target_link_libraries(vec_add_gpu PRIVATE 
                        Halide::Tools
                        vector_add
                        benchmark::benchmark
                        benchmark::benchmark_main)

The strange part is that the code executes without any issue in some environments. So far I have tested it both inside a container (where it sometimes works and sometimes does not) and directly on the host OS. I have also tested on several GPUs (V100, RTX2080, A100); currently it only works on the RTX2080.

Dockerfile that I am using to test on the RTX2080 (Fedora 41)

# Base image: NVIDIA CUDA 12.6.3 "devel" variant on Ubuntu 24.04.
FROM docker.io/nvidia/cuda:12.6.3-devel-ubuntu24.04

# Build tools needed for the LLVM and Halide source builds below; the apt
# package lists are removed in the same layer.
RUN apt-get update && apt-get install -y \
    ninja-build \
    cmake       \
    git         \
    g++         \
    gcc         \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Build LLVM 18.1.8 (clang, lld, compiler-rt) from source in Release mode,
# with NVPTX among the enabled targets, and install it into /usr/local.
# The source checkout and build tree are deleted at the end of this layer.
RUN git clone --depth 1 --branch llvmorg-18.1.8 https://github.com/llvm/llvm-project.git && \
        cmake -G Ninja -S llvm-project/llvm -B build \
        -DCMAKE_BUILD_TYPE=Release \
        -DLLVM_ENABLE_PROJECTS="clang;lld" \
        -DLLVM_ENABLE_RUNTIMES=compiler-rt \
        -DLLVM_TARGETS_TO_BUILD="WebAssembly;X86;AArch64;ARM;Hexagon;NVPTX;PowerPC;RISCV" \
        -DLLVM_ENABLE_ASSERTIONS=ON \
        -DLLVM_ENABLE_EH=ON \
        -DLLVM_ENABLE_RTTI=ON \
        -DLLVM_ENABLE_HTTPLIB=OFF \
        -DLLVM_ENABLE_LIBEDIT=OFF \
        -DLLVM_ENABLE_LIBXML2=OFF \
        -DLLVM_ENABLE_TERMINFO=OFF \
        -DLLVM_ENABLE_ZLIB=OFF \
        -DLLVM_ENABLE_ZSTD=OFF \
        -DLLVM_BUILD_32_BITS=OFF \
        -DCMAKE_INSTALL_PREFIX=/usr/local && \
        cmake --build build && \
        cmake --install build --prefix=/usr/local && \
        rm -rf llvm-project build

# Additional libraries and tools (image libraries, archive tools, pkg-config,
# Nsight Systems, gdb); the apt package lists are again removed in-layer.
RUN apt-get update && apt-get install -y \
        libjpeg-dev \
        libpng-dev  \
        curl        \
        zip         \
        unzip       \
        pkg-config  \
        nsight-systems \
        gdb            \
        tar      && \
        rm -rf /var/lib/apt/lists/*

# Build Halide v18.0.0 from source in Release mode and install it under
# /usr/local/halide. Both the source checkout and the build tree are removed
# in the same layer (as the LLVM step does) so neither stays in the image.
RUN git clone --depth 1 --branch v18.0.0 https://github.com/halide/Halide.git && \
    cmake -G Ninja -S Halide -B build -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build && \
    cmake --install build --prefix=/usr/local/halide && \
    rm -rf Halide build

# Create the directories that the vcpkg environment variables below point to.
RUN mkdir -p /opt/vcpkg-downloads  /opt/vcpkg-binary-cache

# vcpkg configuration: binary cache and downloads locations, vcpkg on PATH,
# and VCPKG_FORCE_SYSTEM_BINARIES set to "1".
ENV VCPKG_DEFAULT_BINARY_CACHE="/opt/vcpkg-binary-cache" \
    VCPKG_DOWNLOADS="/opt/vcpkg-downloads" \
    PATH="/opt/vcpkg:$PATH" \
    VCPKG_FORCE_SYSTEM_BINARIES="1"

# Clone vcpkg at tag 2024.11.16 into /opt/vcpkg and bootstrap it with
# metrics disabled.
RUN cd /opt && \
    git clone --recursive --depth 1 --branch 2024.11.16  https://github.com/microsoft/vcpkg.git && \
    cd vcpkg && \
    ./bootstrap-vcpkg.sh -disableMetrics

thanks

Dec 16 '24 23:12 aavbsouza

Is this still an issue? Have you tried running this in gdb? What's the error value?

Feb 14 '25 17:02 mcourteaux