TensorRT-LLM

CUDA runtime error in cublasSetWorkspace(getCublasHandle(), mCublasWorkspace, workspaceSize): CUBLAS_STATUS_INTERNAL_ERROR

Open · YuanWind opened this issue on Sep 07, 2024 · 1 comment

When I used tensorrtllm as the backend for Triton, I configured the following parameters:

instance_group [
  {
      count: 2
      kind: KIND_GPU
      gpus: [0]
  }
]
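
For context, this block sits in the tensorrt_llm model's config.pbtxt roughly as sketched below; the model name, max_batch_size, and decoupled setting are placeholders/assumptions from my setup (the decoupled_execution_loop frame in the trace suggests decoupled mode), not a complete config:

name: "tensorrt_llm"
backend: "tensorrtllm"
max_batch_size: 8                 # placeholder value
model_transaction_policy {
  decoupled: true                 # assumption; consistent with decoupled_execution_loop in the trace
}
# ... the instance_group block above goes here ...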

Triton starts and serves requests correctly as long as they come from a single process. But when several processes send requests at the same time, the server crashes with the following error (a minimal sketch of the client I use to reproduce this follows the log):

terminate called after throwing an instance of 'tensorrt_llm::common::TllmException'
  what():  [TensorRT-LLM][ERROR] CUDA runtime error in cublasSetWorkspace(getCublasHandle(), mCublasWorkspace, workspaceSize): CUBLAS_STATUS_INTERNAL_ERROR (/tmp/tritonbuild/tensorrtllm/tensorrt_llm/cpp/tensorrt_llm/common/cublasMMWrapper.cpp:148)
1       0x7f30d8659dfe void tensorrt_llm::common::check<cublasStatus_t>(cublasStatus_t, char const*, char const*, int) + 174
2       0x7f30d8658415 tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, float, float, cublasLtMatmulAlgo_t const&, bool, bool) + 309
3       0x7f30d865895b tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, std::optional<cublasLtMatmulHeuristicResult_t> const&) + 123
4       0x7f30586b160d /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9(+0x16560d) [0x7f30586b160d]
5       0x7f30586b1f9a tensorrt_llm::plugins::GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) + 346
6       0x7f30140fd5e9 /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10e45e9) [0x7f30140fd5e9]
7       0x7f30140c16af /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10a86af) [0x7f30140c16af]
8       0x7f30140c3320 /usr/local/tensorrt/lib/libnvinfer.so.9(+0x10aa320) [0x7f30140c3320]
9       0x7f30d88471bb tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int) + 59
10      0x7f30d884784f tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(std::map<unsigned long, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > >&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<unsigned long, std::allocator<unsigned long> > const&) + 735
11      0x7f30d8853e0a tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forward(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 4218
12      0x7f30d8803fde tensorrt_llm::batch_manager::GptManager::step(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::set<unsigned long, std::less<unsigned long>, std::allocator<unsigned long> >&) + 62
13      0x7f30d8809ac7 tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() + 247
14      0x7f313eddc253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7f313eddc253]
15      0x7f313eb6bac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7f313eb6bac3]
16      0x7f313ebfd850 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x126850) [0x7f313ebfd850]
Signal (6) received.
terminate called recursively
Signal (6) received.
 0# 0x0000556B33C6B04D in tritonserver
 1# 0x00007F313EB19520 in /usr/lib/x86_64-linux-gnu/libc.so.6
 2# pthread_kill in /usr/lib/x86_64-linux-gnu/libc.so.6
 3# raise in /usr/lib/x86_64-linux-gnu/libc.so.6
 4# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
 5# 0x00007F313EDA2B9E in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 6# 0x00007F313EDAE20C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 7# 0x00007F313EDAD1E9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 8# __gxx_personality_v0 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 9# 0x00007F3140F88884 in /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
10# _Unwind_Resume in /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
11# 0x00007F30D8626BC0 in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
12# tensorrt_llm::common::CublasMMWrapper::Gemm(cublasOperation_t, cublasOperation_t, int, int, int, void const*, int, void const*, int, void*, int, std::optional<cublasLtMatmulHeuristicResult_t> const&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
13# 0x00007F30586B160D in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9
14# tensorrt_llm::plugins::GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const*, nvinfer1::PluginTensorDesc const*, void const* const*, void* const*, void*, CUstream_st*) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.9
15# 0x00007F30140FD5E9 in /usr/local/tensorrt/lib/libnvinfer.so.9
16# 0x00007F30140C16AF in /usr/local/tensorrt/lib/libnvinfer.so.9
17# 0x00007F30140C3320 in /usr/local/tensorrt/lib/libnvinfer.so.9
18# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeContext(int) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
19# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(std::map<unsigned long, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > >&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<unsigned long, std::allocator<unsigned long> > const&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
20# tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forward(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
21# tensorrt_llm::batch_manager::GptManager::step(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::set<unsigned long, std::less<unsigned long>, std::allocator<unsigned long> >&) in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
22# tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() in /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so
23# 0x00007F313EDDC253 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
24# 0x00007F313EB6BAC3 in /usr/lib/x86_64-linux-gnu/libc.so.6
25# 0x00007F313EBFD850 in /usr/lib/x86_64-linux-gnu/libc.so.6

Aborted (core dumped)
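
For reference, this is roughly how the concurrent requests are issued: a minimal sketch, assuming Triton's HTTP generate endpoint on port 8000 and a model named "ensemble". The model name, port, and input field names (text_input, max_tokens) are placeholders from my setup and may differ in yours.

import multiprocessing as mp

import requests

# Placeholder endpoint; adjust model name / host / port to your deployment.
URL = "http://localhost:8000/v2/models/ensemble/generate"

def worker(idx: int) -> None:
    # Each process sends its own synchronous request to the same server.
    payload = {"text_input": f"hello from process {idx}", "max_tokens": 64}
    resp = requests.post(URL, json=payload, timeout=120)
    print(idx, resp.status_code, resp.text[:80])

if __name__ == "__main__":
    # A handful of concurrent processes is enough to hit the
    # CUBLAS_STATUS_INTERNAL_ERROR above with count: 2 on a single GPU.
    procs = [mp.Process(target=worker, args=(i,)) for i in range(8)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

A single such process works fine; the crash only appears once two or more of them run in parallel.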

YuanWind · Sep 07 '24

This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days.

github-actions[bot] · Oct 08 '24

This issue was closed because it has been stalled for 15 days with no activity.

github-actions[bot] · Oct 24 '24