TensorRT-LLM
CUDA runtime error in cublasSetWorkspace(getCublasHandle(), mCublasWorkspace, workspaceSize): CUBLAS_STATUS_INTERNAL_ERROR
When I use TensorRT-LLM as the backend for Triton, I configure the following instance_group in the model's config.pbtxt:
instance_group [
  {
    count: 2
    kind: KIND_GPU
    gpus: [0]
  }
]
Triton starts up fine and serves requests correctly as long as only a single client process is sending requests. However, when multiple processes send requests concurrently (a sketch of that kind of client load is shown after the trace below), the server crashes with the following error:
terminate called after throwing an instance of 'tensorrt_llm::common::TllmException'
what(): [TensorRT-LLM][ERROR] CUDA runtime error in cublasSetWorkspace(getCublasHandle(), mCublasWorkspace, workspaceSize): CUBLAS_STATUS_INTERNAL_ERROR (/tmp/tritonbuild/tensorrtllm/tensorrt_llm/cpp/tensorrt_llm/common/cublasMMWrapper.cpp:148)
1 0x7f30d8659dfe void tensorrt_llm::common::check<cublasStatus_t>(cublasStatus_t, char const*, char const*, int) + 174
2 0x7f30d8658415 tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, float, float, cublasLtMatmulAlgo_t const&, bool, bool) + 309
3 0x7f30d865895b tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, std::optional<cublasLtMatmulHeuristicResult_t> const&) + 123
4 0x7f30586b160d /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9(+0x16560d) [0x7f30586b160d]
5 0x7f30586b1f9a tensorrt_llm::plugins::GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) + 346
6 0x7f30140fd5e9 /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10e45e9) [0x7f30140fd5e9]
7 0x7f30140c16af /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10a86af) [0x7f30140c16af]
8 0x7f30140c3320 /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10aa320) [0x7f30140c3320]
9 0x7f30d88471bb tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int) + 59
10 0x7f30d884784f tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(std::map<unsigned long, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > >&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<unsigned long, std::allocator<unsigned long> > const&) + 735
11 0x7f30d8853e0a tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forward(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 4218
12 0x7f30d8803fde tensorrt_llm::batch_manager::GptManager::step(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::set<unsigned long, std::less<unsigned long>, std::allocator<unsigned long> >&) + 62
13 0x7f30d8809ac7 tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() + 247
14 0x7f313eddc253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7f313eddc253]
15 0x7f313eb6bac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7f313eb6bac3]
16 0x7f313ebfd850 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x126850) [0x7f313ebfd850]
Signal (6) received.
terminate called recursively
Signal (6) received.
0# 0x0000556B33C6B04D in tritonserver
1# 0x00007F313EB19520 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# pthread_kill in /usr/lib/x86_64-linux-gnu/libc.so.6
3# raise in /usr/lib/x86_64-linux-gnu/libc.so.6
4# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
5# 0x00007F313EDA2B9E in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007F313EDAE20C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007F313EDAD1E9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# __gxx_personality_v0 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
9# 0x00007F3140F88884 in /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
10# _Unwind_Resume in /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
11# 0x00007F30D8626BC0 in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
12# tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, std::optional<cublasLtMatmulHeuristicResult_t> const&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
13# 0x00007F30586B160D in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9
14# tensorrt_llm::plugins::GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9
15# 0x00007F30140FD5E9 in /usr/local/tensorrt/lib/libnvinfer.so.9
16# 0x00007F30140C16AF in /usr/local/tensorrt/lib/libnvinfer.so.9
17# 0x00007F30140C3320 in /usr/local/tensorrt/lib/libnvinfer.so.9
18# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
19# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(std::map<unsigned long, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > >&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<unsigned long, std::allocator<unsigned long> > const&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
20# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forward(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
21# tensorrt_llm::batch_manager::GptManager::step(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::set<unsigned long, std::less<unsigned long>, std::allocator<unsigned long> >&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
22# tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
23# 0x00007F313EDDC253 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
24# 0x00007F313EB6BAC3 in /usr/lib/x86_64-linux-gnu/libc.so.6
25# 0x00007F313EBFD850 in /usr/lib/x86_64-linux-gnu/libc.so.6
Aborted (core dumped)
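For reference, the crash only appears under concurrent load, so here is a minimal sketch of the kind of multi-process client that triggers it. The model name ("ensemble"), server URL, and tensor names ("text_input", "max_tokens", "text_output") are assumptions based on the typical TensorRT-LLM ensemble config, not taken from my deployment, and will need adjusting.

# Minimal reproduction sketch. Assumptions: model served as "ensemble" on
# localhost:8000, tensor names as in the typical TensorRT-LLM ensemble
# config (text_input / max_tokens / text_output). Adjust to your setup.
import multiprocessing as mp

import numpy as np
import tritonclient.http as httpclient


def send_request(_):
    client = httpclient.InferenceServerClient(url="localhost:8000")
    text = httpclient.InferInput("text_input", [1, 1], "BYTES")
    text.set_data_from_numpy(np.array([["Hello"]], dtype=object))
    max_tokens = httpclient.InferInput("max_tokens", [1, 1], "INT32")
    max_tokens.set_data_from_numpy(np.array([[32]], dtype=np.int32))
    result = client.infer("ensemble", inputs=[text, max_tokens])
    return result.as_numpy("text_output")


if __name__ == "__main__":
    # A single process works; several processes sending requests at the
    # same time hit the CUBLAS_STATUS_INTERNAL_ERROR above.
    with mp.Pool(processes=8) as pool:
        pool.map(send_request, range(8))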
This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days.
This issue was closed because it has been stalled for 15 days with no activity.