flow matching tensorRT组batch推理,和单batch推理结果不一致。
对导出onnx和tensorRT的代码做了修改:
测试代码
import sys
import torchaudio
import torch
import tensorrt as trt

# Path to the serialized TensorRT engine of the flow-matching decoder estimator.
flow_decoder_estimator_model = "pretrained_models/CosyVoice2-0.5B-dynamic_batch/flow.decoder.estimator.fp16.mygpu.plan"

# Deserialize the engine and create an execution context.
with open(flow_decoder_estimator_model, 'rb') as f:
    estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
if estimator_engine is None:
    raise ValueError('failed to load trt {}'.format(flow_decoder_estimator_model))
estimator = estimator_engine.create_execution_context()


def _load_case(prefix):
    """Load one saved estimator test case (batch of 2) from *prefix*/.

    Returns (x, mask, mu, t, spks, cond); shapes per the saved files are
    assumed to be (2, 80, T), (2, 1, T), (2, 80, T), (2,), (2, 80),
    (2, 80, T) — TODO confirm against the dump code that produced them.
    """
    return (
        torch.load("{}/x_1.pt".format(prefix)),
        torch.load("{}/mask_1.pt".format(prefix)),
        torch.load("{}/mu_1.pt".format(prefix)),
        torch.load("{}/t_1.pt".format(prefix)),
        torch.load("{}/spks_1.pt".format(prefix)),
        torch.load("{}/cond_1.pt".format(prefix)),
    )


x1, mask1, mu1, t1, spks1, cond1 = _load_case("test1")  # 2 x 80 x 552
x2, mask2, mu2, t2, spks2, cond2 = _load_case("test2")  # 2 x 80 x 552

# Pad both cases to a common length and stack them into one batch of 4.
# NOTE(review): the padded tail (frames >= 552) is zero in x/mu/cond AND zero
# in mask. If any op inside the estimator ignores the mask (attention over the
# full padded length, normalization over all frames, ...), its statistics
# differ from the unpadded run — this is the usual cause of "padded batch !=
# unpadded batch" mismatches (cf. the maintainer's hint to fix the batch mask
# logic); confirm every length-dependent op in the estimator applies the mask.
batch_size = 4
max_len = 568
x = torch.zeros((batch_size, 80, max_len)).to(x1)
mask = torch.zeros((batch_size, 1, max_len)).to(x1)
mu = torch.zeros((batch_size, 80, max_len)).to(x1)
t = torch.zeros((batch_size,)).to(x1)
spks = torch.zeros((batch_size, 80)).to(x1)
cond = torch.zeros((batch_size, 80, max_len)).to(x1)
x[:2, :, :x1.shape[2]] = x1
x[2:, :, :x2.shape[2]] = x2
mask[:2, :, :mask1.shape[2]] = mask1
mask[2:, :, :mask2.shape[2]] = mask2
mu[:2, :, :mu1.shape[2]] = mu1
mu[2:, :, :mu2.shape[2]] = mu2
t[:2] = t1
t[2:] = t2
spks[:2] = spks1
spks[2:] = spks2
cond[:2, :, :cond1.shape[2]] = cond1
cond[2:, :, :cond2.shape[2]] = cond2

# Declare the dynamic input shapes for this padded batch.
estimator.set_input_shape('x', (x.size(0), 80, x.size(2)))
estimator.set_input_shape('mask', (x.size(0), 1, x.size(2)))
estimator.set_input_shape('mu', (x.size(0), 80, x.size(2)))
estimator.set_input_shape('t', (x.size(0),))
estimator.set_input_shape('spks', (x.size(0), 80))
estimator.set_input_shape('cond', (x.size(0), 80, x.size(2)))

# BUG FIX 1: `tensor.contiguous()` can return a NEW temporary tensor; taking
# .data_ptr() of an unbound temporary lets Python free its storage before
# execute_v2 runs, handing TensorRT dangling device pointers. Bind the
# contiguous tensors to names that stay alive across the call.
x_c = x.contiguous()
mask_c = mask.contiguous()
mu_c = mu.contiguous()
t_c = t.contiguous()
spks_c = spks.contiguous()
cond_c = cond.contiguous()

# BUG FIX 2: the original passed x.data_ptr() as the output binding, aliasing
# the 'x' input buffer. In-place input/output aliasing is not guaranteed safe
# by TensorRT, and it also destroys the input you would want to compare
# against the single-batch run. Use a dedicated output buffer.
out = torch.empty_like(x_c)

# Run the engine. NOTE(review): execute_v2 takes raw pointers in the engine's
# binding order — verify (x, mask, mu, t, spks, cond, output) matches the
# order the ONNX/engine was exported with.
estimator.execute_v2([x_c.data_ptr(),
                      mask_c.data_ptr(),
                      mu_c.data_ptr(),
                      t_c.data_ptr(),
                      spks_c.data_ptr(),
                      cond_c.data_ptr(),
                      out.data_ptr()])
print(out)
padding后的推理结果不一致,不做padding的推理结果是一致的。
可以帮忙提供下排查思路吗,感谢~
mask部分针对batch逻辑修改一下
This issue is stale because it has been open for 30 days with no activity.
对导出onnx和tensorRT的代码做了修改:
测试代码
import sys import torchaudio import torch import tensorrt as trt flow_decoder_estimator_model = "pretrained_models/CosyVoice2-0.5B-dynamic_batch/flow.decoder.estimator.fp16.mygpu.plan" with open(flow_decoder_estimator_model, 'rb') as f: estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read()) if estimator_engine is None: raise ValueError('failed to load trt {}'.format(flow_decoder_estimator_model)) estimator = estimator_engine.create_execution_context()
x1 = torch.load("test1/x_1.pt") # 2 x 80 x 552 mask1 = torch.load("test1/mask_1.pt") # 2 x 1 x 552 mu1 = torch.load("test1/mu_1.pt") # 2 x 80 x 552 t1 = torch.load("test1/t_1.pt") # 2 spks1 = torch.load("test1/spks_1.pt") # 2 x 80 cond1 = torch.load("test1/cond_1.pt") # 2 x 80 x 552
x2 = torch.load("test2/x_1.pt") # 2 x 80 x 552 mask2 = torch.load("test2/mask_1.pt") # 2 x 1 x 552 mu2 = torch.load("test2/mu_1.pt") # 2 x 80 x 552 t2 = torch.load("test2/t_1.pt") # 2 spks2 = torch.load("test2/spks_1.pt") # 2 x 80 cond2 = torch.load("test2/cond_1.pt") # 2 x 80 x 552
x = torch.zeros((4, 80, 568)).to(x1) mask = torch.zeros((4, 1, 568)).to(x1) mu = torch.zeros((4, 80, 568)).to(x1) t = torch.zeros((4)).to(x1) spks = torch.zeros((4, 80)).to(x1) cond = torch.zeros((4, 80, 568)).to(x1)
x[:2, :, :x1.shape[2]] = x1 x[2:, :, :x2.shape[2]] = x2
mask[:2, :, :mask1.shape[2]] = mask1 mask[2:, :, :mask2.shape[2]] = mask2
mu[:2, :, :mu1.shape[2]] = mu1 mu[2:, :, :mu2.shape[2]] = mu2
t[:2] = t1 t[2:] = t2
spks[:2] = spks1 spks[2:] = spks2
cond[:2, :, :cond1.shape[2]] = cond1 cond[2:, :, :cond2.shape[2]] = cond2
estimator.set_input_shape('x', (x.size(0), 80, x.size(2))) estimator.set_input_shape('mask', (x.size(0), 1, x.size(2))) estimator.set_input_shape('mu', (x.size(0), 80, x.size(2))) estimator.set_input_shape('t', (x.size(0),)) estimator.set_input_shape('spks', (x.size(0), 80)) estimator.set_input_shape('cond', (x.size(0), 80, x.size(2)))
# run trt engine
estimator.execute_v2([x.contiguous().data_ptr(), mask.contiguous().data_ptr(), mu.contiguous().data_ptr(), t.contiguous().data_ptr(), spks.contiguous().data_ptr(), cond.contiguous().data_ptr(), x.data_ptr()]) print(x) padding后的推理结果不一致,不做padding的推理结果是一致的。
可以帮忙提供下排查思路吗,感谢~
@WangGewu hi,这个flow的tensorrt怎么batch推理呢
试过,好像这个batch 推理,工程接入以后并没有改善 RTF啊。 组织一个缓存队列管理 满 batch的推理,里外耗时比 单batch推理还要慢一些。