Running MoE 8x7B on 4x A100 gives CUDA OOM
same issue as https://github.com/NVIDIA/TensorRT-LLM/issues/1156
What do you recommend? Thanks~
The OOM is raised while stacking the per-expert MoE weights:
torch.stack([model_params[f'model.layers.{l}.block_sparse_moe.experts.{expert}.{suffix}.weight'].detach()
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 0 has a total capacty of 79.33 GiB of which 367.81 MiB is free. Process 193307 has 78.96 GiB memory in use. Of the allocated memory 77.73 GiB is allocated by PyTorch, and 768.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
@byshiue Do you have any suggestions? Thanks~
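As a first mitigation, the PyTorch error message above itself suggests tuning the caching allocator via PYTORCH_CUDA_ALLOC_CONF. A minimal sketch of re-running the Mixtral checkpoint conversion with that setting, assuming the flags from the TensorRT-LLM Mixtral example (paths, the split size, and the exact flags are placeholders, not a confirmed fix):

    # Reduce allocator fragmentation, as hinted by the OOM message; 512 MiB is only an example value.
    export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
    # Hypothetical paths; adjust model_dir/output_dir to your setup.
    python convert_checkpoint.py --model_dir ./Mixtral-8x7B-v0.1 \
        --output_dir ./tllm_checkpoint_mixtral_4gpu \
        --dtype float16 \
        --tp_size 4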
We encountered the same issue: the engine was built with tensorrt_llm v0.9.0 and tp_size=4, and running it fails with:
Traceback (most recent call last):
File "/app/tensorrt_llm/examples/run.py", line 564, in <module>
main(args)
File "/app/tensorrt_llm/examples/run.py", line 413, in main
runner = runner_cls.from_dir(**runner_kwargs)
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/model_runner_cpp.py", line 173, in from_dir
session = GptSession(config=session_config,
RuntimeError: [TensorRT-LLM][ERROR] CUDA runtime error in ::cudaMallocAsync(ptr, n, mCudaStream->get()): out of memory (/app/tensorrt_llm/cpp/tensorrt_llm/runtime/tllmBuffers.h:118)
1 0x7f60d261a695 void tensorrt_llm::common::check<cudaError>(cudaError, char const*, char const*, int) + 149
2 0x7f60d26ae854 tensorrt_llm::runtime::BufferManager::gpu(unsigned long, nvinfer1::DataType) const + 228
3 0x7f60d276e209 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(void const*, unsigned long, nvinfer1::ILogger&) + 569
4 0x7f60d271cc66 tensorrt_llm::runtime::GptSession::GptSession(tensorrt_llm::runtime::GptSession::Config const&, tensorrt_llm::runtime::GptModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, void const*, unsigned long, std::shared_ptr<nvinfer1::ILogger>) + 950
5 0x7f60d606a90a /usr/local/lib/python3.10/dist-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0xba90a) [0x7f60d606a90a]
6 0x7f60d6016575 /usr/local/lib/python3.10/dist-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x66575) [0x7f60d6016575]
7 0x7f60d5ffad3f /usr/local/lib/python3.10/dist-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x4ad3f) [0x7f60d5ffad3f]
8 0x55bd97b0c10e python(+0x15a10e) [0x55bd97b0c10e]
9 0x55bd97b02a7b _PyObject_MakeTpCall + 603
10 0x55bd97b1aacb python(+0x168acb) [0x55bd97b1aacb]
11 0x55bd97b1b635 _PyObject_Call + 277
12 0x55bd97b17087 python(+0x165087) [0x55bd97b17087]
13 0x55bd97b02e2b python(+0x150e2b) [0x55bd97b02e2b]
14 0x7f60d5ff0c7d /usr/local/lib/python3.10/dist-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x40c7d) [0x7f60d5ff0c7d]
15 0x55bd97b02a7b _PyObject_MakeTpCall + 603
16 0x55bd97afc150 _PyEval_EvalFrameDefault + 30112
17 0x55bd97b1a7f1 python(+0x1687f1) [0x55bd97b1a7f1]
18 0x55bd97b1b492 PyObject_Call + 290
19 0x55bd97af75d7 _PyEval_EvalFrameDefault + 10791
20 0x55bd97b0c9fc _PyFunction_Vectorcall + 124
21 0x55bd97af526d _PyEval_EvalFrameDefault + 1725
22 0x55bd97af19c6 python(+0x13f9c6) [0x55bd97af19c6]
23 0x55bd97be7256 PyEval_EvalCode + 134
24 0x55bd97c12108 python(+0x260108) [0x55bd97c12108]
25 0x55bd97c0b9cb python(+0x2599cb) [0x55bd97c0b9cb]
26 0x55bd97c11e55 python(+0x25fe55) [0x55bd97c11e55]
27 0x55bd97c11338 _PyRun_SimpleFileObject + 424
28 0x55bd97c10f83 _PyRun_AnyFileObject + 67
29 0x55bd97c03a5e Py_RunMain + 702
30 0x55bd97bda02d Py_BytesMain + 45
31 0x7f632e518d90 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7f632e518d90]
32 0x7f632e518e40 __libc_start_main + 128
33 0x55bd97bd9f25 _start + 37
After tuning the build parameters, everything works fine now:
--max_batch_size: 64 -> 32
--max_input_len: 8196 -> 4096
--max_output_len: 1024 -> 512
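For reference, a sketch of a trtllm-build invocation with these reduced limits; only the three limit values come from this thread, while the checkpoint/output directories and the gemm plugin setting are assumptions to make the command self-contained:

    # Hypothetical directories; reuse the converted checkpoint from the conversion step.
    trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_4gpu \
        --output_dir ./mixtral_engine_tp4 \
        --gemm_plugin float16 \
        --max_batch_size 32 \
        --max_input_len 4096 \
        --max_output_len 512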
FYI: when using the Triton LLM backend to serve the model, I also hit the error below: "GPU lacks the shared memory resources to run grouped gemm kernel". After decreasing 'kv_cache_free_gpu_mem_fraction' from 0.9 to 0.8, the error disappeared.
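In case it helps others: with the tensorrtllm_backend Triton setup, this value is typically filled into the tensorrt_llm model's config.pbtxt, for example with the repo's fill_template.py helper. A minimal sketch, where the model repository path is an assumption and only the parameter name and the 0.8 value come from this thread:

    # Assumed layout of a tensorrtllm_backend model repository; adjust the path as needed.
    python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
        kv_cache_free_gpu_mem_fraction:0.8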
It looks like the issue is resolved, so I am closing it. Feel free to ask again if you still have any issues or questions.