
Set_binding_shape returns False

Open · 980202006 opened this issue 3 years ago • 1 comment

Description

Set_binding_shape returns False

import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
import os.path
import numpy as np
from multiprocessing import cpu_count
import cv2
import shutil
from PIL import Image
from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import ctypes
import gc
np.random.seed(1)
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TensorRTInference:
    # Wraps a single serialized TensorRT engine.
    def __init__(self, engine_path, inputs=None, outputs=None) -> None:
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.trt_runtime = trt.Runtime(self.logger)
        self.engine = self.build_engine(engine_path,self.trt_runtime)
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(inputs, outputs)
        self.context = self.engine.create_execution_context()
        self.set_input_flag = False
    def build_engine(self, engine_path, trt_runtime):
        trt.init_libnvinfer_plugins(None, "")             
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self, inputs, outputs, batch_size=30):
        # Note: the inputs/outputs arguments are ignored; buffers are rebuilt from the engine bindings.
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for idx, binding in enumerate(self.engine):
            # For dynamic bindings, get_binding_shape can contain -1, which makes this volume invalid.
            size = trt.volume(self.engine.get_binding_shape(binding)) * batch_size
            print(size)
            host_mem = cuda.pagelocked_empty(size, dtype=trt.nptype(self.engine.get_binding_dtype(idx)))
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    
    def exec(self):
        # context.set_shape_input(idx, tensor)  # for dynamic shape-input tensors
        begin_time = time.time()
        self.context.execute_async(batch_size=10, bindings=self.bindings, stream_handle=self.stream.handle)
        print('execute_async Time: ', time.time() - begin_time)
        begin_time = time.time()
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream) 
        print("memcpy_dtoh_async Time: ", time.time() - begin_time)
        begin_time = time.time()
        self.stream.synchronize()
        print('synchronize Time: ', time.time() - begin_time)
        begin_time = time.time()
        trt_output = [out.host for out in self.outputs]
        # print(trt_output[0].max(), trt_output[0].min())  # the output values look reasonable
        print('gather outputs Time: ', time.time() - begin_time)
        # Next step: chain several TensorRT models together.
        return trt_output

    def __call__(self):
        # Run the engine.
        with self.context:
            out = self.exec()
        return out
    
    def set_input(self, dummy_input):
        # Set the input shapes.
        # set_binding_shape returns False if the binding is not a dynamic input
        # or if the requested shape lies outside the optimization profile's range.
        ret = self.context.set_binding_shape(0, list(dummy_input[0].shape()))
        print(ret)
        ret = self.context.set_binding_shape(1, [dummy_input[0].shape()[0], 1, dummy_input[0].shape()[2], dummy_input[0].shape()[3]])
        print(ret)
        # Point the bindings at the DALI tensors' device addresses.
        for idx, inp in enumerate(self.inputs):
            # dummy_input[idx].copy_to_external(ptr=inp.device, cuda_stream=stream)
            self.bindings[idx] = int(dummy_input[idx].data_ptr())
        self.set_input_flag = True
    def __del__(self):
        try:
            for inp in self.inputs:
                # inp.host.free()
                inp.device.free()  # HostDeviceMem stores the allocation as .device
            for out in self.outputs:
                # out.host.free()
                out.device.free()
            self.logger.log(trt.Logger.INFO, 'Freed allocated GPU memory')
            while 1:
                try:
                    self.cuda_ctx.detach()  # self.cuda_ctx is never created in this class
                except Exception:
                    break
        except Exception:
            pass
        del self.context
        del self.engine
        # del self.cuda_ctx
        del self.stream
        del self.outputs
        del self.inputs
        gc.collect()


class DaliStream():
    def __init__(self, filenames, sequence_length=300, batch_size=1, num_threads=int(cpu_count() / 2), device_id=0) -> None:
        self.filenames = filenames
        self.num_threads = num_threads
        self.device_id = device_id
        self.get_frame_num(filenames)
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.pipe = self.video_pipe(filenames=filenames, sequence_length=self.sequence_length, prefetch_queue_depth=1,
            batch_size=self.batch_size, num_threads=num_threads, device_id=device_id)
        self.pipe.build()
    def init_video_pipe(self, filenames):
        self.pipe = self.video_pipe(filenames=filenames, sequence_length=self.sequence_length, batch_size=self.batch_size,
            num_threads=self.num_threads, device_id=self.device_id)
        self.pipe.build()
    def get_frame_num(self, filenames=None):
        if filenames is None:
            filenames = self.filenames
        self.stream = cv2.VideoCapture(filenames)
        self.fps = int(self.stream.get(cv2.CAP_PROP_FPS))
        self.size = int(self.stream.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    @staticmethod
    def get_epoch_size(pipe):
        # Only valid when batch_size is 1.
        meta = pipe.reader_meta()
        return list(meta.values())[0]['epoch_size']
    @pipeline_def
    def video_pipe(self, filenames, sequence_length=300):
        # if filenames is None:
        #     filenames = self.filenames
        initial_prefetch_size = sequence_length
        # The last batch is usually a watermarked end card, so no padding is applied.
        # If normalized=True, dtype must be set to types.DALIDataType.FLOAT.
        video = fn.readers.video(device="gpu", filenames=filenames, sequence_length=sequence_length,
                                # normalized=True,random_shuffle=False,dtype=types.DALIDataType.FLOAT,
                                initial_fill=initial_prefetch_size)
        return video
    
    def __getitem__(self):
        pipe_out = self.pipe.run()
        # The output buffer address cannot be pinned to a fixed location yet; it has to be handed to the model.
        return pipe_out[0][0]
    
    def get_iter_num(self):
        return self.size / self.sequence_length / self.batch_size

if __name__ == '__main__':
    # # Load the model
    # engine_path = './model/tensorrt_model.engine'
    # # Define the input/output shapes
    # inputs = [np.zeros([1, 3, 224, 224], dtype=np.float32)]
    # outputs = [np.zeros([1, 1, 224, 224], dtype=np.float32)]
    # # Create the model
    # model = TensorRTInference(engine_path, inputs, outputs)
    # # Set the input shapes
    # model.set_input(inputs)
    # # Run the model
    # model()
    # # Print the outputs
    # print(outputs[0])
    # # Delete the model
    # del model
    # gc.collect()
    # # After deleting the model the memory is released, but there is still an issue: the model's memory is not
    # # actually freed, which causes a memory leak. A cleanup method is needed here to release the model's memory.
    video_filename = '/ssd1/xingyum/models/Implicit-Internal-Video-Inpainting/E2FGVI/examples/4f8f1fdf-72dc-49ed-90bd-3751689ee881.mp4'
    dali_pipe = DaliStream(video_filename)
    for i in range(dali_pipe.get_epoch_size(dali_pipe.pipe)):
        output = dali_pipe.__getitem__()
        # img = Image.fromarray((np.array(output[0][0].as_cpu())[0]*255).astype(np.uint8))

        print(output)
        # img.save('datas/test/test.png')
        print(1)
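
For reference, when an engine is built with an explicit batch dimension and dynamic shapes, the usual order is: set the shape of every dynamic input on the execution context, size the output buffers from the context (where the dynamic dimensions are then resolved) rather than from the engine, and run execute_async_v2; execute_async with a batch_size argument only applies to implicit-batch engines. A minimal sketch of that flow, assuming the engine, context, bindings and stream objects built in the script above, a single optimization profile, and an illustrative input shape:

# Sketch only: `engine`, `context`, `bindings`, `stream` come from the script above.
input_shape = (30, 3, 224, 224)  # illustrative, not taken from the issue

ok = context.set_binding_shape(0, input_shape)  # must happen before sizing outputs
print(ok, context.all_binding_shapes_specified)

# Size output buffers from the context, where dynamic dims are now concrete.
for idx in range(engine.num_bindings):
    if not engine.binding_is_input(idx):
        out_shape = context.get_binding_shape(idx)
        nbytes = trt.volume(out_shape) * np.dtype(trt.nptype(engine.get_binding_dtype(idx))).itemsize
        # ... allocate a device buffer of `nbytes` and store its address in bindings[idx]

context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)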

Environment

TensorRT Version:
NVIDIA GPU:
NVIDIA Driver Version:
CUDA Version:
CUDNN Version:
Operating System:
Python Version (if applicable):
Tensorflow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if so, version):

Relevant Files

Steps To Reproduce

980202006 • Aug 05 '22 08:08

Have you checked https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/ExecutionContext.html?highlight=set_binding_shape#tensorrt.IExecutionContext.set_binding_shape first?
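
For what it's worth, set_binding_shape returns False when the binding is not a dynamic input (binding 1 in the script may well be an output), when the rank does not match, when the requested shape falls outside the optimization profile's min/max range, or when the engine was built in implicit batch mode (which the execute_async(batch_size=...) call suggests). A rough way to check, assuming a single optimization profile and the engine object from the script:

# Diagnostic sketch; `engine` is the deserialized engine from the script in the issue.
for idx in range(engine.num_bindings):
    name = engine.get_binding_name(idx)
    if engine.binding_is_input(idx):
        print(name, 'engine shape:', engine.get_binding_shape(idx))  # -1 marks dynamic dims
        print(name, 'profile min/opt/max:', engine.get_profile_shape(0, idx))
    else:
        print(name, 'is an output; set_binding_shape should not be called on it')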

zerollzeng • Aug 07 '22 11:08

Thank you!

980202006 • Aug 15 '22 06:08