[Kaldi] cudaError_t 400

Open LinaKim93 opened this issue 2 years ago • 0 comments

Hello, I am getting a fatal error when running the Kaldi backend on Triton Inference Server.

Related to Kaldi

Describe the bug

ERROR ([5.5]:SynchronizeGpu():cu-device.cc:638) cudaError_t 400 : "invalid resource handle" returned from 'cudaGetLastError()'

[ Stack-Trace: ]
/opt/kaldi/src/lib/libkaldi-base.so(kaldi::MessageLogger::LogMessage() const+0x793) [0x7f94688541c3]
/workspace/model-repo/kaldi_online/1/libtriton_kaldi.so(kaldi::MessageLogger::LogAndThrow::operator=(kaldi::MessageLogger const&)+0x2a) [0x7f945eee510a]
/opt/kaldi/src/lib/libkaldi-cudamatrix.so(kaldi::SynchronizeGpu()+0xf6) [0x7f945dab1beb]
/opt/kaldi/src/lib/libkaldi-cudamatrix.so(kaldi::CuMemoryAllocator::MallocPitch(unsigned long, unsigned long, unsigned long*)+0x51e) [0x7f945db2a846]
/opt/kaldi/src/lib/libkaldi-cudamatrix.so(kaldi::CuMatrix<float>::Resize(int, int, kaldi::MatrixResizeType, kaldi::MatrixStrideType)+0x2b1) [0x7f945dae4325]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedStaticNnet3::RunNnet3(kaldi::CuMatrix<float>*, int)+0x679) [0x7f945ebb14e9]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedStaticNnet3::RunBatch(std::vector<int, std::allocator<int> > const&, std::vector<float*, std::allocator<float*> > const&, int, std::vector<float*, std::allocator<float*> > const&, std::vector<int, std::allocator<int> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<bool, std::allocator<bool> > const&, kaldi::CuMatrix<float>*, std::vector<std::vector<std::pair<int, float*>, std::allocator<std::pair<int, float*> > >, std::allocator<std::vector<std::pair<int, float*>, std::allocator<std::pair<int, float*> > > > >*)+0x270) [0x7f945ebb1962]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline::RunNnet3(std::vector<int, std::allocator<int> > const&, std::vector<float*, std::allocator<float*> > const&, int, std::vector<int, std::allocator<int> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<float*, std::allocator<float*> > const&)+0x36) [0x7f945eb7e8b4]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<float*, std::allocator<float*> > const&, int, std::vector<int, std::allocator<int> > const&, std::vector<float*, std::allocator<float*> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<int, std::allocator<int> >*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*> >*, std::vector<bool, std::allocator<bool> >*)+0xd7) [0x7f945eb8070d]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(std::vector<unsigned long, std::allocator<unsigned long> > const&, kaldi::Matrix<float> const&, std::vector<int, std::allocator<int> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*> >*, std::vector<bool, std::allocator<bool> >*)+0xce) [0x7f945eb80888]
/opt/kaldi/src/lib/libkaldi-cudadecoder.so(kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<kaldi::SubVector<float>, std::allocator<kaldi::SubVector<float> > > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<bool, std::allocator<bool> > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*> >*, std::vector<bool, std::allocator<bool> >*)+0xa7) [0x7f945eb809b7]
/workspace/model-repo/kaldi_online/1/libtriton_kaldi.so(TRITONBACKEND_ModelInstanceExecute+0xddd) [0x7f945eee137d]
/opt/tritonserver/lib/libtritonserver.so(+0x2ec1f7) [0x7f94d1af01f7]
/opt/tritonserver/lib/libtritonserver.so(+0xfc640) [0x7f94d1900640]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xd6de4) [0x7f94d1359de4]
/usr/lib/x86_64-linux-gnu/libpthread.so.0(+0x9609) [0x7f94d17d7609]
/usr/lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7f94d1047293]

To Reproduce

Steps to reproduce the behavior, following the guide at https://developer.nvidia.com/blog/integrating-nvidia-triton-inference-server-with-kaldi-asr/ :

Install triton
Launch triton server
Launch triton client

Environment

Container images: nvcr.io/nvidia/kaldi:21.08-py3, nvcr.io/nvidia/tritonserver:21.05-py3, nvcr.io/nvidia/tritonserver:21.05-py3-sdk
GPUs: 2 x P5000
NVIDIA driver version: 525.60.11

Apr 17 '23 06:04 LinaKim93