TensorRT
set_binding_shape returns False
Description
context.set_binding_shape returns False in the script below.
import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
from multiprocessing import cpu_count
import cv2
import shutil
from PIL import Image
from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import ctypes
import gc
import numpy as np
np.random.seed(1)
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class TensorRTInference:
    # A single TensorRT model.
    def __init__(self, engine_path, inputs=None, outputs=None) -> None:
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.trt_runtime = trt.Runtime(self.logger)
        self.engine = self.build_engine(engine_path, self.trt_runtime)
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(inputs, outputs)
        self.context = self.engine.create_execution_context()
        self.set_input_flag = False

    def build_engine(self, engine_path, trt_runtime):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self, inputs, outputs, batch_size=30):
        # note: the inputs/outputs arguments are ignored and rebuilt here
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for idx, binding in enumerate(self.engine):
            size = trt.volume(self.engine.get_binding_shape(binding)) * batch_size
            print(size)
            host_mem = cuda.pagelocked_empty(size, dtype=trt.nptype(self.engine.get_binding_dtype(idx)))
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
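    # Note (assumption about the failure mode seen in this issue): for an
    # engine built with dynamic input shapes, engine.get_binding_shape()
    # reports -1 for the dynamic dimensions, so trt.volume() above can come
    # out negative. With dynamic shapes, buffers are normally sized from
    # context.get_binding_shape(idx) *after* the input shapes have been set
    # via set_binding_shape().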
    def exec(self):
        # context.set_shape_input(idx, tensor)  # for dynamic shape tensors
        begin_time = time.time()
        self.context.execute_async(batch_size=10, bindings=self.bindings, stream_handle=self.stream.handle)
        print('execute_async Time: ', time.time() - begin_time)
        begin_time = time.time()
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        print('memcpy_dtoh_async Time: ', time.time() - begin_time)
        begin_time = time.time()
        self.stream.synchronize()
        print('synchronize Time: ', time.time() - begin_time)
        begin_time = time.time()
        trt_output = [out.host for out in self.outputs]
        # print(trt_output[0].max(), trt_output[0].min())  # the output values look fine
        print('collect outputs Time: ', time.time() - begin_time)
        # next step: chain several TensorRT models together
        return trt_output

    def __call__(self):
        # run the model
        with self.context:
            out = self.exec()
        return out
    def set_input(self, dummy_input):
        # set the input binding shapes
        ret = self.context.set_binding_shape(0, list(dummy_input[0].shape()))
        print(ret)
        ret = self.context.set_binding_shape(1, [dummy_input[0].shape()[0], 1, dummy_input[0].shape()[2], dummy_input[0].shape()[3]])
        print(ret)
        # point the bindings at the input data addresses
        for idx, inp in enumerate(self.inputs):
            # dummy_input[idx].copy_to_external(ptr=inp.device, cuda_stream=stream)
            self.bindings[idx] = int(dummy_input[idx].data_ptr())
        self.set_input_flag = True
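        # Note: set_binding_shape applies only to *input* bindings whose engine
        # shape has dynamic (-1) dimensions, and the requested shape must lie
        # within the active optimization profile's [min, max] range; otherwise
        # the call fails (observed above as a False return value). If binding 1
        # is an output binding, that call is expected to fail as well.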
    def __del__(self):
        try:
            for inp in self.inputs:
                # inp.host.free()
                inp.device.free()
            for out in self.outputs:
                # out.host.free()
                out.device.free()
            self.logger.log(trt.Logger.INFO, 'Freed allocated GPU memory')
            while 1:
                try:
                    # self.cuda_ctx is never created in this script, so this
                    # loop exits immediately
                    self.cuda_ctx.detach()
                except Exception:
                    break
        except Exception:
            pass
        del self.context
        del self.engine
        # del self.cuda_ctx
        del self.stream
        del self.outputs
        del self.inputs
        gc.collect()
class DaliStream:
    def __init__(self, filenames, sequence_length=300, batch_size=1, num_threads=int(cpu_count() / 2), device_id=0) -> None:
        self.filenames = filenames
        self.num_threads = num_threads
        self.device_id = device_id
        self.get_frame_num(filenames)
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.pipe = self.video_pipe(filenames=filenames, sequence_length=self.sequence_length, prefetch_queue_depth=1,
                                    batch_size=self.batch_size, num_threads=num_threads, device_id=device_id)
        self.pipe.build()

    def init_video_pipe(self, filenames):
        self.pipe = self.video_pipe(filenames=filenames, sequence_length=self.sequence_length,
                                    batch_size=self.batch_size, num_threads=self.num_threads, device_id=self.device_id)
        self.pipe.build()
    def get_frame_num(self, filenames=None):
        if filenames is None:
            filenames = self.filenames
        self.stream = cv2.VideoCapture(filenames)
        self.fps = int(self.stream.get(cv2.CAP_PROP_FPS))
        self.size = int(self.stream.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

    @staticmethod
    def get_epoch_size(pipe):
        # only valid when batch size is 1
        meta = pipe.reader_meta()
        return list(meta.values())[0]['epoch_size']
    @pipeline_def
    def video_pipe(self, filenames, sequence_length=300):
        # if filenames is None:
        #     filenames = self.filenames
        initial_prefetch_size = sequence_length
        # the last batch is usually a watermarked end card, so do not pad it
        # if normalized=True, dtype must be set to types.DALIDataType.FLOAT
        video = fn.readers.video(device="gpu", filenames=filenames, sequence_length=sequence_length,
                                 # normalized=True, random_shuffle=False, dtype=types.DALIDataType.FLOAT,
                                 initial_fill=initial_prefetch_size)
        return video

    def __getitem__(self, index=0):
        pipe_out = self.pipe.run()
        # the output buffer address cannot be pinned yet; it has to be handed to the model
        return pipe_out[0][0]
    def get_iter_num(self):
        return self.size / self.sequence_length / self.batch_size
if __name__ == '__main__':
    # # Load the model
    # engine_path = './model/tensorrt_model.engine'
    # # Declare the input/output layouts
    # inputs = [np.zeros([1, 3, 224, 224], dtype=np.float32)]
    # outputs = [np.zeros([1, 1, 224, 224], dtype=np.float32)]
    # # Create the model
    # model = TensorRTInference(engine_path, inputs, outputs)
    # # Set the input shapes
    # model.set_input(inputs)
    # # Run the model
    # model()
    # # Print the result
    # print(outputs[0])
    # # Delete the model
    # del model
    # gc.collect()
    # # After deleting the model the host memory is released, but there is
    # # still a problem: the model's GPU memory is not freed, which leaks;
    # # a cleanup method is needed here to release the model's memory.
    video_filename = '/ssd1/xingyum/models/Implicit-Internal-Video-Inpainting/E2FGVI/examples/4f8f1fdf-72dc-49ed-90bd-3751689ee881.mp4'
    dali_pipe = DaliStream(video_filename)
    for i in range(dali_pipe.get_epoch_size(dali_pipe.pipe)):
        output = dali_pipe.__getitem__()
        # img = Image.fromarray((np.array(output[0][0].as_cpu())[0] * 255).astype(np.uint8))
        print(output)
        # img.save('datas/test/test.png')
    print(1)
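For reference, the script above sizes its buffers with a hard-coded batch_size multiplier before any binding shape is set. For a dynamic-shape engine the usual order is the reverse: set the input binding shapes first, then size the buffers from the context. A minimal sketch of that flow, assuming a deserialized single-profile engine ('model.engine' and the 1x3x224x224 shape are placeholders, not taken from the script):

import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

logger = trt.Logger(trt.Logger.WARNING)
with open('model.engine', 'rb') as f:  # placeholder path
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# 1. Resolve the dynamic (-1) dimensions on the input binding first; the call
#    fails if the shape lies outside the optimization profile's [min, max]
#    range. (It returns a bool in the TRT version used in this issue.)
ok = context.set_binding_shape(0, (1, 3, 224, 224))
print('set_binding_shape:', ok, '| all specified:', context.all_binding_shapes_specified)

# 2. Only now do all bindings have concrete shapes to allocate from.
buffers = []
for idx in range(engine.num_bindings):
    size = trt.volume(context.get_binding_shape(idx))
    dtype = trt.nptype(engine.get_binding_dtype(idx))
    buffers.append(cuda.mem_alloc(size * np.dtype(dtype).itemsize))

# 3. With an explicit-batch (dynamic shape) engine, inference should go
#    through execute_async_v2 (no batch_size argument), not execute_async.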
Environment
TensorRT Version:
NVIDIA GPU:
NVIDIA Driver Version:
CUDA Version:
CUDNN Version:
Operating System:
Python Version (if applicable):
Tensorflow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if so, version):
Relevant Files
Steps To Reproduce
Have you checked https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/ExecutionContext.html?highlight=set_binding_shape#tensorrt.IExecutionContext.set_binding_shape first?
Thank you!
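If it helps, the bounds the engine will actually accept can be inspected directly; a shape outside them is a likely reason for the False return. A quick check (a sketch; binding 0 and profile 0 assumed, with engine deserialized as in the script above):

print(engine.get_binding_shape(0))  # -1 marks a dynamic dim; no -1 means a static engine
min_s, opt_s, max_s = engine.get_profile_shape(0, 0)  # args: (profile_index, binding)
print(min_s, opt_s, max_s)  # set_binding_shape only succeeds within [min_s, max_s]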