Compile error when trying to install MXNet on the ROCm platform

Open andyzhanged opened this issue 5 years ago • 0 comments

Environment info

OS: $ lsb_release -a No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 18.04.5 LTS Release: 18.04 Codename: bionic

Compiler: $ hipcc --version HIP version: 3.3.20126-2dbba46b HCC clang version 10.0.0 (/data/jenkins-workspace/compute-rocm-rel-3.3/external/hcc-tot/llvm-project/clang 1ce0fe5e88b2124494b9500817b4c2c66bdfa5aa) (based on HCC 3.1.20114-6776c83f-1ce0fe5e88b ) Target: x86_64-unknown-linux-gnu Thread model: posix InstalledDir: /opt/rocm-3.3.0/hcc/bin

Package used (Python/R/Scala/Julia): python

MXNET version: install from source.

MXNet commit hash (git rev-parse HEAD): 0cd2b0b82d269fa86e71258eb467b1e46f641b64

Python version and distribution: $ python --version Python 3.6.9

ROCM Version: 3.3.0

Error Message:

I tried to install MXNet by following the guide at https://github.com/ROCmSoftwarePlatform/mxnet#installation-guide.

$ make -j1  CXX=g++-6 USE_GPU=1              
Makefile:346: WARNING: could not find nvcc compiler, the specified path was: hipcc
Running CUDA_ARCH: --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906
cd /disk/zhanged/code/mxnet/3rdparty/dmlc-core; make libdmlc.a USE_SSE=1 config=/disk/zhanged/code/mxnet/make/config.mk; cd /disk/zhanged/code/mxnet
make[1]: Entering directory '/disk/zhanged/code/mxnet/3rdparty/dmlc-core'
make[1]: 'libdmlc.a' is up to date.
make[1]: Leaving directory '/disk/zhanged/code/mxnet/3rdparty/dmlc-core'
hipcc -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3   --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 -Xcompiler "-DMSHADOW_FORCE_STREAM -Wall -Wsign-compare -O3 -DNDEBUG=1 -I. -I/opt/rocm/include -I/opt/rocm/hipblas/include -I/opt/rocm/hiprand/include -I/opt/rocm/rocfft/include -I/opt/rocm/hipcub/include/ -I/opt/rocm/rocblas/include -I/opt/rocm/rocrand/include -I/disk/zhanged/code/mxnet/3rdparty/mshadow/ -I/disk/zhanged/code/mxnet/3rdparty/dmlc-core/include -fPIC -I/disk/zhanged/code/mxnet/3rdparty/tvm/nnvm/include -I/disk/zhanged/code/mxnet/3rdparty/dlpack/include -I/disk/zhanged/code/mxnet/3rdparty/tvm/include -Iinclude -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs -msse3 -mf16c -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 -DMSHADOW_RABIT_PS=0 -DMSHADOW_DIST_PS=0 -DMSHADOW_USE_PASCAL=0 -DMXNET_USE_OPENCV=1 -I/usr/include/opencv -fopenmp -DMXNET_USE_OPERATOR_TUNING=1 -DMXNET_USE_LAPACK  -I/opt/rocm/hipcub/include/hipcub/rocprim -DMXNET_USE_RCCL=0 -DMXNET_USE_LIBJPEG_TURBO=0" -M -MT build/src/operator/nn/ctc_loss_gpu.o src/operator/nn/ctc_loss.cu >build/src/operator/nn/ctc_loss_gpu.d
clang-10: warning: argument unused during compilation: '-Xcompiler' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx801' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx802' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx803' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx900' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx906' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '-Xcompiler' [-Wunused-command-line-argument]
hipcc -c -o build/src/operator/nn/ctc_loss_gpu.o -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3   --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 -Xcompiler "-DMSHADOW_FORCE_STREAM -Wall -Wsign-compare -O3 -DNDEBUG=1 -I. -I/opt/rocm/include -I/opt/rocm/hipblas/include -I/opt/rocm/hiprand/include -I/opt/rocm/rocfft/include -I/opt/rocm/hipcub/include/ -I/opt/rocm/rocblas/include -I/opt/rocm/rocrand/include -I/disk/zhanged/code/mxnet/3rdparty/mshadow/ -I/disk/zhanged/code/mxnet/3rdparty/dmlc-core/include -fPIC -I/disk/zhanged/code/mxnet/3rdparty/tvm/nnvm/include -I/disk/zhanged/code/mxnet/3rdparty/dlpack/include -I/disk/zhanged/code/mxnet/3rdparty/tvm/include -Iinclude -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs -msse3 -mf16c -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 -DMSHADOW_RABIT_PS=0 -DMSHADOW_DIST_PS=0 -DMSHADOW_USE_PASCAL=0 -DMXNET_USE_OPENCV=1 -I/usr/include/opencv -fopenmp -DMXNET_USE_OPERATOR_TUNING=1 -DMXNET_USE_LAPACK  -I/opt/rocm/hipcub/include/hipcub/rocprim -DMXNET_USE_RCCL=0 -DMXNET_USE_LIBJPEG_TURBO=0" src/operator/nn/ctc_loss.cu
clang-10: warning: argument unused during compilation: '-Xcompiler' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx801' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx802' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx803' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx900' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '--amdgpu-target=gfx906' [-Wunused-command-line-argument]
clang-10: warning: argument unused during compilation: '-Xcompiler' [-Wunused-command-line-argument]
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:29:
In file included from include/mxnet/operator_util.h:43:
In file included from include/mxnet/./base.h:32:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/tensor.h:16:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/./base.h:29:
In file included from ./hip-wrappers.h:8:
In file included from /opt/rocm/include/hip/hip_runtime.h:56:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime.h:105:
/opt/rocm/include/hip/hcc_detail/surface_functions.h:37:18: warning: comparison of integers of different signs: 'int32_t' (aka 'int') and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
         ~~~~~~~ ^ ~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:37:50: warning: comparison of integers of different signs: 'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
                                               ~ ^ ~~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:54:20: warning: comparison of integers of different signs: 'int32_t' (aka 'int') and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
           ~~~~~~~ ^ ~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:54:52: warning: comparison of integers of different signs: 'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
                                                 ~ ^ ~~~~~~
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:34:
src/operator/nn/./sequence_mask-inl.h:55:5: warning: misleading indentation; statement is not part of the previous 'if' [-Wmisleading-indentation]
    for (index_t s = lengths[batch]; s < smax; ++s)
    ^
src/operator/nn/./sequence_mask-inl.h:51:3: note: previous statement is here
  if (batch >= bmax)
  ^
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:35:
In file included from src/operator/nn/../sequence_op_common.h:31:
In file included from src/operator/nn/.././operator_common.h:42:
src/operator/nn/../../common/cuda_utils.h:246:11: warning: enumeration values 'HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED' and 'HIPRAND_STATUS_NOT_IMPLEMENTED' not handled in switch [-Wswitch]
  switch (status) {
          ^
In file included from src/operator/nn/ctc_loss.cu:27:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/gpu_ctc.h:25:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/gpu_ctc_kernels.h:23:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/ctascan.cuh:38:
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/deviceutil.cuh:68:13: error: no matching function for call to 'min'
        range.x += min(block, task.y);
                   ^~~
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/devicetypes.cuh:260:23: note: candidate function not viable: no known conversion from 'int' to 'int2' (aka 'HIP_vector_type<int, 2>') for 1st argument
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
                      ^
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/devicetypes.cuh:243:20: note: candidate template ignored: deduced conflicting types for parameter 'T' ('int' vs. 'hip_impl::Scalar_accessor<int, int __attribute__((ext_vector_type(2))), 1>')
MGPU_HOST_DEVICE T min(T a, T b) {
                   ^
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:29:
In file included from include/mxnet/operator_util.h:43:
In file included from include/mxnet/base.h:32:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/./cuda/../tensor.h:16:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/base.h:29:
In file included from ./hip-wrappers.h:8:
In file included from /opt/rocm/include/hip/hip_runtime.h:56:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime.h:57:
In file included from /opt/rocm/include/hip/hip_runtime_api.h:348:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime_api.h:44:
In file included from /opt/rocm/include/hip/hcc_detail/hip_texture_types.h:38:
In file included from /opt/rocm/include/hip/hcc_detail/channel_descriptor.h:28:
/opt/rocm/include/hip/hcc_detail/hip_vector_types.h:176:22: warning: unused variable 'r' [-Wunused-variable]
                auto r{data[idx]};
                     ^
/opt/rocm/rocrand/include/rocrand_philox4x32_10.h:284:26: note: in instantiation of member function 'hip_impl::Scalar_accessor<unsigned int, unsigned int __attribute__((ext_vector_type(4))), 0>::operator++' requested here
        m_state.counter.x++;
                         ^
7 warnings and 1 error generated.
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:29:
In file included from include/mxnet/operator_util.h:43:
In file included from include/mxnet/./base.h:32:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/tensor.h:16:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/./base.h:29:
In file included from ./hip-wrappers.h:8:
In file included from /opt/rocm/include/hip/hip_runtime.h:56:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime.h:105:
/opt/rocm/include/hip/hcc_detail/surface_functions.h:37:18: warning: comparison of integers of different signs: 'int32_t' (aka 'int') and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
         ~~~~~~~ ^ ~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:37:50: warning: comparison of integers of different signs: 'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
                                               ~ ^ ~~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:54:20: warning: comparison of integers of different signs: 'int32_t' (aka 'int') and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
           ~~~~~~~ ^ ~~~~~
/opt/rocm/include/hip/hcc_detail/surface_functions.h:54:52: warning: comparison of integers of different signs: 'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare]
    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
                                                 ~ ^ ~~~~~~
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:34:
src/operator/nn/./sequence_mask-inl.h:55:5: warning: misleading indentation; statement is not part of the previous 'if' [-Wmisleading-indentation]
    for (index_t s = lengths[batch]; s < smax; ++s)
    ^
src/operator/nn/./sequence_mask-inl.h:51:3: note: previous statement is here
  if (batch >= bmax)
  ^
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:35:
In file included from src/operator/nn/../sequence_op_common.h:31:
In file included from src/operator/nn/.././operator_common.h:42:
src/operator/nn/../../common/cuda_utils.h:246:11: warning: enumeration values 'HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED' and 'HIPRAND_STATUS_NOT_IMPLEMENTED' not handled in switch [-Wswitch]
  switch (status) {
          ^
In file included from src/operator/nn/ctc_loss.cu:27:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/gpu_ctc.h:25:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/gpu_ctc_kernels.h:23:
In file included from src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/ctascan.cuh:38:
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/deviceutil.cuh:68:13: error: no matching function for call to 'min'
        range.x += min(block, task.y);
                   ^~~
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/devicetypes.cuh:260:23: note: candidate function not viable: no known conversion from 'int' to 'int2' (aka 'HIP_vector_type<int, 2>') for 1st argument
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
                      ^
src/operator/nn/../../../3rdparty/ctc_include/detail/../contrib/moderngpu/include/device/devicetypes.cuh:243:20: note: candidate template ignored: deduced conflicting types for parameter 'T' ('int' vs. 'hip_impl::Scalar_accessor<int, int __attribute__((ext_vector_type(2))), 1>')
MGPU_HOST_DEVICE T min(T a, T b) {
                   ^
In file included from src/operator/nn/ctc_loss.cu:26:
In file included from src/operator/nn/./ctc_loss-inl.h:29:
In file included from include/mxnet/operator_util.h:43:
In file included from include/mxnet/base.h:32:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/./cuda/../tensor.h:16:
In file included from /disk/zhanged/code/mxnet/3rdparty/mshadow/mshadow/base.h:29:
In file included from ./hip-wrappers.h:8:
In file included from /opt/rocm/include/hip/hip_runtime.h:56:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime.h:57:
In file included from /opt/rocm/include/hip/hip_runtime_api.h:348:
In file included from /opt/rocm/include/hip/hcc_detail/hip_runtime_api.h:44:
In file included from /opt/rocm/include/hip/hcc_detail/hip_texture_types.h:38:
In file included from /opt/rocm/include/hip/hcc_detail/channel_descriptor.h:28:
/opt/rocm/include/hip/hcc_detail/hip_vector_types.h:176:22: warning: unused variable 'r' [-Wunused-variable]
                auto r{data[idx]};
                     ^
/opt/rocm/rocrand/include/rocrand_philox4x32_10.h:284:26: note: in instantiation of member function 'hip_impl::Scalar_accessor<unsigned int, unsigned int __attribute__((ext_vector_type(4))), 0>::operator++' requested here
        m_state.counter.x++;
                         ^
7 warnings and 1 error generated.
Makefile:507: recipe for target 'build/src/operator/nn/ctc_loss_gpu.o' failed
make: *** [build/src/operator/nn/ctc_loss_gpu.o] Error 1

What have I tried to solve it?

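The compiler cannot resolve the call to the `min` function: according to the error notes, `block` is an `int` while `task.y` is a `hip_impl::Scalar_accessor`, so the template parameter `T` is deduced with conflicting types. So I modified the code as shown below: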

diff --git a/3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh
index e18807f38..fb4f08f21 100644
--- a/3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh
+++ b/3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh
@@ -65,7 +65,7 @@ MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
 MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
        int2 range;
        range.x = task.x * block;
-       range.x += min(block, task.y);
+       range.x += min(block, (int)task.y);
        range.y = range.x + task.x + (block < task.y);
        return range;
 }

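After this modification, MXNet builds successfully and the demo bdk_demo.py runs successfully on my vega20 card. So the cause appears to be either the HIP `int2` type (whose `.y` member is a `hip_impl::Scalar_accessor` rather than a plain `int`) or the template function `min`. Does anyone have an idea of the proper fix? Thanks.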

Jan 05 '21 06:01 andyzhanged