Build gets stuck and raises Error Code 2: Internal Error (Assertion memSize >= 0 failed.)
Description
[08/04/2022-17:24:38] [TRT] [V] *************** Autotuning Reformat: Float(E8,E8,E7,E6,(* 108 E1),108,1) where E0=(- 108 (- (# 3 (SHAPE input_1)) (* 108 (CEIL_DIV (+ (# 3 (SHAPE input_1)) -107) 108)))) E1=(CEIL_DIV (+ E3 -428) 432) E2=(- 60 (- (# 2 (SHAPE input_1)) (* 60 (CEIL_DIV (+ (# 2 (SHAPE input_1)) -59) 60)))) E3=(+ (- E0 (* 108 (TRUNC_DIV E0 108))) (# 3 (SHAPE input_1))) E4=(MIN (+ (- E2 (* 60 (TRUNC_DIV E2 60))) (# 2 (SHAPE input_1))) (- (# 2 (SHAPE input_2)) (FLOOR_DIV (# 2 (SHAPE input_2)) -1))) E5=(CEIL_DIV (+ E4 -236) 240) E6=(* 6480 E1) E7=(* (# 2 (RESHAPE 1 (# 1 (RESHAPE 1 1 (# 3 (RESHAPE 1 64 (+ (CEIL_DIV (+ E4 -4) 4) 1) (+ (CEIL_DIV (+ E3 -4) 4) 1) | 1 1 64 E5 60 E1 108 zeroIsPlaceholder)) E1 64 60 108 | 1 (* E1 E5) 414720 zeroIsPlaceholder)) 414720 | 1 1 E5 E1 64 60 108 zeroIsPlaceholder)) E6) E8=(* 64 E7) -> Float(E8,E8,E7,E6,1,6480,60) where E0=(- 108 (- (# 3 (SHAPE input_1)) (* 108 (CEIL_DIV (+ (# 3 (SHAPE input_1)) -107) 108)))) E1=(- 60 (- (# 2 (SHAPE input_1)) (* 60 (CEIL_DIV (+ (# 2 (SHAPE input_1)) -59) 60)))) E2=(+ (- E0 (* 108 (TRUNC_DIV E0 108))) (# 3 (SHAPE input_1))) E3=(MIN (+ (- E1 (* 60 (TRUNC_DIV E1 60))) (# 2 (SHAPE input_1))) (- (# 2 (SHAPE input_2)) (FLOOR_DIV (# 2 (SHAPE input_2)) -1))) E4=(CEIL_DIV (+ E2 -428) 432) E5=(CEIL_DIV (+ E3 -236) 240) E6=(* 6480 E4) E7=(* (# 2 (RESHAPE 1 (# 1 (RESHAPE 1 1 (# 3 (RESHAPE 1 64 (+ (CEIL_DIV (+ E3 -4) 4) 1) (+ (CEIL_DIV (+ E2 -4) 4) 1) | 1 1 64 E5 60 E4 108 zeroIsPlaceholder)) E4 64 60 108 | 1 (* E4 E5) 414720 zeroIsPlaceholder)) 414720 | 1 1 E5 E4 64 60 108 zeroIsPlaceholder)) E6) E8=(* 64 E7) ***************
[08/04/2022-17:24:38] [TRT] [V] Deleting timing cache: 1322 entries, 1595 hits
[08/04/2022-17:24:38] [TRT] [E] 2: [blockChooser.cpp::getRegionBlockSize::680] Error Code 2: Internal Error (Assertion memSize >= 0 failed. )
Traceback (most recent call last):
File "to_fp16.py", line 221, in
Environment
TensorRT Version: 8.4.1.5
NVIDIA GPU: NVIDIA A6000
NVIDIA Driver Version: 510.73.05
CUDA Version: 11.2
CUDNN Version: 11.2
Operating System: Ubuntu 20.04
Python Version (if applicable): 3.7.11
PyTorch Version (if applicable): 1.12+cu113
Relevant Files
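First script: STTN model definition and PyTorch-to-ONNX export (a second script that builds the TensorRT engine follows it below).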
''' Spatial-Temporal Transformer Networks
'''
import os
os.environ['CUDA_VISIBLE_DEVICES']='9'
import numpy as np
import math
import torch
#
from torch2trt import torch2trt
torch._C._jit_set_bailout_depth(1)
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('/ssd1/xingyum/models/STTN')
from core.spectral_norm import spectral_norm as _spectral_norm
class BaseNetwork(nn.Module):
def __init__(self):
super(BaseNetwork, self).__init__()
def print_network(self):
if isinstance(self, list):
self = self[0]
num_params = 0
for param in self.parameters():
num_params += param.numel()
print('Network [%s] was created. Total number of parameters: %.1f million. '
'To see the architecture, do print(network).' % (type(self).__name__, num_params / 1000000))
def init_weights(self, init_type='normal', gain=0.02):
'''
initialize network's weights
init_type: normal | xavier | kaiming | orthogonal
https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
'''
def init_func(m):
classname = m.__class__.__name__
if classname.find('InstanceNorm2d') != -1:
if hasattr(m, 'weight') and m.weight is not None:
nn.init.constant_(m.weight.data, 1.0)
if hasattr(m, 'bias') and m.bias is not None:
nn.init.constant_(m.bias.data, 0.0)
elif hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
if init_type == 'normal':
nn.init.normal_(m.weight.data, 0.0, gain)
elif init_type == 'xavier':
nn.init.xavier_normal_(m.weight.data, gain=gain)
elif init_type == 'xavier_uniform':
nn.init.xavier_uniform_(m.weight.data, gain=1.0)
elif init_type == 'kaiming':
nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif init_type == 'orthogonal':
nn.init.orthogonal_(m.weight.data, gain=gain)
elif init_type == 'none': # uses pytorch's default init method
m.reset_parameters()
else:
raise NotImplementedError(
'initialization method [%s] is not implemented' % init_type)
if hasattr(m, 'bias') and m.bias is not None:
nn.init.constant_(m.bias.data, 0.0)
self.apply(init_func)
# propagate to children
for m in self.children():
if hasattr(m, 'init_weights'):
m.init_weights(init_type, gain)
class InpaintGenerator(BaseNetwork):
def __init__(self, init_weights=False):
super(InpaintGenerator, self).__init__()
channel = 256
stack_num = 8
patchsize = [(108, 60), (36, 20), (18, 10), (9, 5)]
blocks = []
for _ in range(stack_num):
blocks.append(TransformerBlock(patchsize, hidden=channel))
self.transformer = nn.ModuleList(blocks)
self.encoder = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(128, channel, kernel_size=3, stride=1, padding=1),
nn.LeakyReLU(0.2, inplace=True),
)
# decoder: decode frames from features
self.decoder = nn.Sequential(
deconv(channel, 128, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
nn.LeakyReLU(0.2, inplace=True),
deconv(64, 64, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1)
)
if init_weights:
self.init_weights()
# def forward(self, masked_frames, masks):
# # extracting features
# b, t, c, h, w = masked_frames.size()
# masks = masks.view(b*t, 1, h, w)
# enc_feat = self.encoder(masked_frames.view(b*t, c, h, w))
# _, c, h, w = enc_feat.size()
# masks = F.interpolate(masks, scale_factor=1.0/4)
# # enc_feat = self.transformer(
# # {'x': enc_feat, 'm': masks, 'b': b, 'c': c})['x']
# for layer in self.transformer:
# enc_feat = layer(enc_feat, masks, b, c)
# enc_feat = self.transformer([enc_feat, masks, b, c])[0]
# output = self.decoder(enc_feat)
# output = torch.tanh(output)
# return output
def mod(self, a, b):
out = a - a // b * b
return out
def forward(self, img, org_mask):
        mod_size_h = 60  # pad up when the size is not a multiple of the block size; this block could be removed
mod_size_w = 108
B, C, h, w = img.shape
# h, w = img.size()[2:]
        h_pad = self.mod((mod_size_h - self.mod(h, mod_size_h)), mod_size_h)  # this only needs to be computed once
        w_pad = self.mod((mod_size_w - self.mod(w, mod_size_w)), mod_size_w)  # this only needs to be computed once
feat = (img*(1-org_mask).float())
feat = torch.cat(
[feat, torch.flip(feat, [2])],
2)[:, :, :h + h_pad, :]
feat = torch.cat(
[feat, torch.flip(feat, [3])],
3)[:, :, :, :w + w_pad]
masks = torch.cat(
[org_mask, torch.flip(org_mask, [2])],
2)[:, :, :h + h_pad, :]
masks = torch.cat(
[masks, torch.flip(masks, [3])],
3)[:, :, :, :w + w_pad]
video_length = feat.size(0)
feat = self.encoder((feat*(1-masks).float()).view(video_length, 3, feat.size(-2), feat.size(-1)))
_, c, feat_h, feat_w = feat.size()
# feat = feat.view(1, video_length, c, feat_h, feat_w)
t, c, m_h, m_w = masks.size()
masks = masks.view(t, c, m_h, m_w)
masks = F.interpolate(masks, scale_factor=1.0/4)
t, c, _, _ = feat.size()
# enc_feat = self.transformer(
# {'x': feat, 'm': masks, 'b': 1, 'c': c})['x']
# enc_feat = self.transformer(
# feat, masks, 1, c)[0]
for layer in self.transformer:
feat = layer(feat, masks, 1, c)[0]
feat = self.decoder(feat)
feat = feat[:, :, :h, :w]
feat = (feat + 1) / 2
imgs = feat * org_mask + img *(1 - org_mask)
return feat
class deconv(nn.Module):
def __init__(self, input_channel, output_channel, kernel_size=3, padding=0):
super().__init__()
self.conv = nn.Conv2d(input_channel, output_channel,
kernel_size=kernel_size, stride=1, padding=padding)
def forward(self, x):
x = F.interpolate(x, scale_factor=2., mode='bilinear',
align_corners=True)
return self.conv(x)
class Attention(nn.Module):
"""
    Compute 'Scaled Dot Product Attention'
"""
def forward(self, query, key, value, m):
scores = torch.matmul(query, key.transpose(-2, -1)
) / math.sqrt(query.size(-1))
scores.masked_fill(m, -1e9)
p_attn = F.softmax(scores, dim=-1)
p_val = torch.matmul(p_attn, value)
return p_val, p_attn
class MultiHeadedAttention(nn.Module):
"""
Take in model size and number of heads.
"""
def __init__(self, patchsize, d_model):
super().__init__()
self.patchsize = patchsize
self.query_embedding = nn.Conv2d(
d_model, d_model, kernel_size=1, padding=0)
self.value_embedding = nn.Conv2d(
d_model, d_model, kernel_size=1, padding=0)
self.key_embedding = nn.Conv2d(
d_model, d_model, kernel_size=1, padding=0)
self.output_linear = nn.Sequential(
nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, inplace=True))
self.attention = Attention()
def forward(self, x, m, b, c):
bt, _, h, w = x.size()
t = bt // b
d_k = c // len(self.patchsize)
output = []
_query = self.query_embedding(x)
_key = self.key_embedding(x)
_value = self.value_embedding(x)
for (width, height), query, key, value in zip(self.patchsize,
torch.chunk(_query, len(self.patchsize), dim=1), torch.chunk(
_key, len(self.patchsize), dim=1),
torch.chunk(_value, len(self.patchsize), dim=1)):
out_w, out_h = w // width, h // height
mm = m.view(b, t, 1, out_h, height, out_w, width)
mm = mm.permute(0, 1, 3, 5, 2, 4, 6).contiguous().view(
b, t*out_h*out_w, height*width)
mm = (mm.mean(-1) > 0.5).unsqueeze(1).repeat(1, t*out_h*out_w, 1)
# 1) embedding and reshape
query = query.view(b, t, d_k, out_h, height, out_w, width)
query = query.permute(0, 1, 3, 5, 2, 4, 6).contiguous().view(
b, t*out_h*out_w, d_k*height*width)
key = key.view(b, t, d_k, out_h, height, out_w, width)
key = key.permute(0, 1, 3, 5, 2, 4, 6).contiguous().view(
b, t*out_h*out_w, d_k*height*width)
value = value.view(b, t, d_k, out_h, height, out_w, width)
value = value.permute(0, 1, 3, 5, 2, 4, 6).contiguous().view(
b, t*out_h*out_w, d_k*height*width)
'''
# 2) Apply attention on all the projected vectors in batch.
tmp1 = []
for q,k,v in zip(torch.chunk(query, b, dim=0), torch.chunk(key, b, dim=0), torch.chunk(value, b, dim=0)):
y, _ = self.attention(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0))
tmp1.append(y)
y = torch.cat(tmp1,1)
'''
y, _ = self.attention(query, key, value, mm)
# 3) "Concat" using a view and apply a final linear.
y = y.view(b, t, out_h, out_w, d_k, height, width)
y = y.permute(0, 1, 4, 2, 5, 3, 6).contiguous().view(bt, d_k, h, w)
output.append(y)
# print(output)
output = torch.cat(output, 1)
x = self.output_linear(output)
return x
# Standard two-layer FFN of the transformer
class FeedForward(nn.Module):
def __init__(self, d_model):
super(FeedForward, self).__init__()
# We set d_ff as a default to 2048
self.conv = nn.Sequential(
nn.Conv2d(d_model, d_model, kernel_size=3, padding=2, dilation=2),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, inplace=True))
def forward(self, x):
x = self.conv(x)
return x
class TransformerBlock(nn.Module):
"""
Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
"""
def __init__(self, patchsize, hidden=128):
super().__init__()
self.attention = MultiHeadedAttention(patchsize, d_model=hidden)
self.feed_forward = FeedForward(hidden)
def forward(self, x, m, b, c):
# x, m, b, c = w
# x, m, b, c = x['x'], x['m'], x['b'], x['c']
x = x + self.attention(x, m, b, c)
x = x + self.feed_forward(x)
# return {'x': x, 'm': m, 'b': b, 'c': c}
return x, m, b, c
# ######################################################################
# ######################################################################
class Discriminator(BaseNetwork):
def __init__(self, in_channels=3, use_sigmoid=False, use_spectral_norm=True, init_weights=True):
super(Discriminator, self).__init__()
self.use_sigmoid = use_sigmoid
nf = 64
self.conv = nn.Sequential(
spectral_norm(nn.Conv3d(in_channels=in_channels, out_channels=nf*1, kernel_size=(3, 5, 5), stride=(1, 2, 2),
padding=1, bias=not use_spectral_norm), use_spectral_norm),
# nn.InstanceNorm2d(64, track_running_stats=False),
nn.LeakyReLU(0.2, inplace=True),
spectral_norm(nn.Conv3d(nf*1, nf*2, kernel_size=(3, 5, 5), stride=(1, 2, 2),
padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
# nn.InstanceNorm2d(128, track_running_stats=False),
nn.LeakyReLU(0.2, inplace=True),
spectral_norm(nn.Conv3d(nf * 2, nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
# nn.InstanceNorm2d(256, track_running_stats=False),
nn.LeakyReLU(0.2, inplace=True),
spectral_norm(nn.Conv3d(nf * 4, nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
# nn.InstanceNorm2d(256, track_running_stats=False),
nn.LeakyReLU(0.2, inplace=True),
spectral_norm(nn.Conv3d(nf * 4, nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
# nn.InstanceNorm2d(256, track_running_stats=False),
nn.LeakyReLU(0.2, inplace=True),
nn.Conv3d(nf * 4, nf * 4, kernel_size=(3, 5, 5),
stride=(1, 2, 2), padding=(1, 2, 2))
)
if init_weights:
self.init_weights()
def forward(self, xs):
# T, C, H, W = xs.shape
xs_t = torch.transpose(xs, 0, 1)
xs_t = xs_t.unsqueeze(0) # B, C, T, H, W
feat = self.conv(xs_t)
if self.use_sigmoid:
feat = torch.sigmoid(feat)
out = torch.transpose(feat, 1, 2) # B, T, C, H, W
return out
def spectral_norm(module, mode=True):
if mode:
return _spectral_norm(module)
return module
if __name__ == '__main__':
import time
device = 'cuda:0'
model = InpaintGenerator().to(device)
feat_i = torch.randn(6, 3, 720, 1280).to(device)
mask_i = torch.randn(6, 1, 720, 1280).to(device)
data = torch.load('./checkpoints/sttn.pth', map_location=device)
model.load_state_dict(data['netG'])
model.eval()
# model_trt = torch2trt(model, [feat_i, mask_i])
# model = torch.jit.script(model)
# model(feat_i, mask_i)
torch.onnx.export(model,
args=(feat_i, mask_i),
opset_version=13,
export_params=True,
do_constant_folding=True,
f="alexnet.onnx",
dynamic_axes={'input_1':{0:'batch', 2:'width', 3:'height'}, "input_2":{0:'batch', 2:'width', 3:'height'},
'output1':{0:'batch', 2:'width', 3:'height'}},
input_names=["input_1", "input_2"],
output_names=["output1"])
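# ---------------------------------------------------------------------
# Second script: parse the exported ONNX model (alexnet.onnx) and build
# the TensorRT engine. This is the step that gets stuck and then raises
# "Error Code 2: Internal Error (Assertion memSize >= 0 failed.)".
# ---------------------------------------------------------------------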
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '9'
import tensorrt as trt
import logging
import sys
import os
import argparse
import time
sys.path.insert(1, os.path.join(sys.path[0], ".."))
parser = argparse.ArgumentParser(description="Retinaface")
parser.add_argument(
"--onnx_file", default="alexnet.onnx", type=str)
parser.add_argument(
"--engine_file", default="test2.engine", type=str)
args = parser.parse_args()
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
logging.basicConfig(level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)
def check_network(network):
if not network.num_outputs:
logger.warning(
"No output nodes found, marking last layer's outputs as network outputs. Correct this if wrong.")
inputs = [network.get_input(i) for i in range(network.num_inputs)]
outputs = [network.get_output(i) for i in range(network.num_outputs)]
max_len = max([len(inp.name) for inp in inputs] +
[len(out.name) for out in outputs])
logger.debug("=== Network Description ===")
for i, inp in enumerate(inputs):
logger.debug("Input {0} | Name: {1:{2}} | Shape: {3}".format(
i, inp.name, max_len, inp.shape))
for i, out in enumerate(outputs):
logger.debug("Output {0} | Name: {1:{2}} | Shape: {3}".format(
i, out.name, max_len, out.shape))
def create_crnn_profiles(builder, inputs):
profile = builder.create_optimization_profile()
for inp in inputs:
print("-------------------\n\n", inp.name , "\n-++++++++++++++++++++-")
if inp.name == "input_1":
opt_shape = trt.Dims([1, 3, 720, 1280])
max_shape = trt.Dims([1, 3, 820, 1300])
min_shape = trt.Dims([1, 3, 576, 1025])
# min_shape = trt.Dims([1, 3, 112, 112])
# opt_shape = trt.Dims([1, 3, 224, 224])
# max_shape = trt.Dims([1, 3, 512, 512])
profile.set_shape(inp.name, min=min_shape,
opt=opt_shape, max=max_shape)
else:
opt_shape = trt.Dims([1, 1, 720, 1280])
max_shape = trt.Dims([1, 1, 820, 1300])
min_shape = trt.Dims([1, 1, 576, 1025])
profile.set_shape(inp.name, min=min_shape,
opt=opt_shape, max=max_shape)
return [profile]
def add_profiles(config, inputs, opt_profiles):
logger.debug("=== Optimization Profiles ===")
for i, profile in enumerate(opt_profiles):
for inp in inputs:
_min, _opt, _max = profile.get_shape(inp.name)
logger.debug(
"{} - OptProfile {} - Min {} Opt {} Max {}".format(inp.name, i, _min, _opt, _max))
config.add_optimization_profile(profile)
return config
def StrOfSize(size):
def strofsize(integer, remainder, level):
if integer >= 1024:
remainder = integer % 1024
integer //= 1024
level += 1
return strofsize(integer, remainder, level)
else:
return integer, remainder, level
units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
integer, remainder, level = strofsize(size, 0, 0)
if level+1 > len(units):
level = -1
return ('{}.{:>03d}{}'.format(integer, remainder, units[level]))
def build_engine_onnx(model_file):
    # The ONNX parser requires the network definition to be created with the EXPLICIT_BATCH flag set
explicit_batch = 1 << (int)(
trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(explicit_batch) as network, \
builder.create_builder_config() as config, \
trt.OnnxParser(network, TRT_LOGGER) as parser:
config.set_flag(trt.BuilderFlag.FP16)
        # builder.max_workspace_size = 0  # used with TensorRT 7.2
        # builder.max_workspace_size = 1 ** 30
        builder.max_batch_size = 10
config.max_workspace_size = 1 << 30 # 1GiB
# config.max_workspace_size = 1 ** 30 # 1GiB
# Load the Onnx model and parse it in order to populate the TensorRT network.
print("model_file", model_file)
with open(model_file, 'rb') as model:
print("model file successfully opened")
if not parser.parse(model.read()):
print('ERROR: Failed to parse the ONNX file: {}'.format(model_file))
print('ERROR: Failed to parse the ONNX file.')
print("got {} errors: ".format(parser.num_errors))
for error in range(parser.num_errors):
print(parser.get_error(error))
# print(error.code(), error.desc(), error.node())
return None
else:
print("parse successful")
check_network(network)
inputs = [network.get_input(i) for i in range(network.num_inputs)]
opt_profiles = create_crnn_profiles(builder, inputs)
add_profiles(config, inputs, opt_profiles)
logger.info("Building Engine...")
start_time = time.time()
# Specify one or more optimization profiles at build time using config
# This enables the builder to build multiple engines based on the same network definition, but with different builder configurations.
with builder.build_engine(network, config) as engine, open(args.engine_file, "wb") as f:
logger.info(
"Serializing engine to file: {:}".format(args.engine_file))
f.write(engine.serialize())
print("building engine cost %f seconds" %
(time.time() - start_time))
print("complete!")
print("")
print("config.max_workspace_size is",
StrOfSize(config.max_workspace_size))
print("")
print("size of onnx file is", StrOfSize(
os.path.getsize(args.onnx_file)))
print("size of trt engine file is", StrOfSize(
os.path.getsize(args.engine_file)))
print("")
print("")
def get_engine(onnx_file_path, engine_file_path=""):
"""Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
def build_engine():
"""Takes an ONNX file and creates a TensorRT engine to run inference with"""
explicit_batch = 1 << (int)(
trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(explicit_batch) as network, \
builder.create_builder_config() as config, \
trt.OnnxParser(network, TRT_LOGGER) as parser, \
trt.Runtime(TRT_LOGGER) as runtime:
config.max_workspace_size = 1 << 28 # 256MiB
config.set_flag(trt.BuilderFlag.FP16)
builder.max_batch_size = 1
# Parse model file
if not os.path.exists(onnx_file_path):
print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(
onnx_file_path))
exit(0)
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
print('ERROR: Failed to parse the ONNX file.')
for error in range(parser.num_errors):
print(parser.get_error(error))
return None
# The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1
check_network(network)
_ = [network.get_input(i) for i in range(network.num_inputs)]
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(
onnx_file_path))
start_time = time.time()
plan = builder.build_serialized_network(network, config)
engine = runtime.deserialize_cuda_engine(plan)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
f.write(plan)
print("building engine cost %f seconds" %
(time.time() - start_time))
print("complete!")
return engine
if os.path.exists(engine_file_path):
# If a serialized engine exists, use it instead of building an engine.
print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
else:
return build_engine()
if __name__ == '__main__':
onnx_model_file = args.onnx_file
build_engine_onnx(onnx_model_file)
#get_engine(args.onnx_file, args.engine_file)
Steps To Reproduce
The first script exports the PyTorch model to ONNX; the second builds the TensorRT engine from it. The first time, the same code exported successfully. The second time the build got stuck, so I killed the stuck process, and every subsequent export then failed with the blockChooser.cpp::getRegionBlockSize::666] Error Code 2: Internal Error (Assertion memSize >= 0 failed.) error.
Can you share the exported onnx model here? Thanks!
At first glance, I think I can't export the ONNX using your model because of the lines below:
sys.path.append('/ssd1/xingyum/models/STTN')
from core.spectral_norm import spectral_norm as _spectral_norm
Why do these two lines cause the model export to fail? Both the model file and the core.spectral_norm file are in the link below. https://drive.google.com/drive/folders/1SBiykgPPVzkvyy7IjACZQVw2lrxAe1Xa?usp=sharing
I can reproduce this and I've filed an internal bug to track it, thanks for reporting this.
BTW, if you want to work around (WAR) this in the short term, would it be possible to use a static shape? e.g.
./trtexec --onnx=alexnet.onnx --fp16 --optShapes=input_1:1x3x720x1280,input_2:1x1x720x1280
works for me
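If the engine is built with the Python script above rather than trtexec, the same static-shape workaround can be expressed by collapsing the optimization profile to a single shape (min = opt = max). A minimal sketch, assuming the input names and the 720x1280 resolution used above; the helper name create_static_profiles is illustrative and not part of the original scripts:
import tensorrt as trt

def create_static_profiles(builder, inputs):
    # Pin each input to a single shape (min == opt == max), mirroring the
    # --optShapes values in the trtexec command above.
    profile = builder.create_optimization_profile()
    for inp in inputs:
        if inp.name == "input_1":
            shape = trt.Dims([1, 3, 720, 1280])
        else:  # "input_2", the mask input
            shape = trt.Dims([1, 1, 720, 1280])
        profile.set_shape(inp.name, min=shape, opt=shape, max=shape)
    return [profile]
Using this in place of create_crnn_profiles restricts the engine to 1x3x720x1280 / 1x1x720x1280 inputs, which matches the trtexec command above.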
This will be fixed in the next major version. Thanks again for reporting this :-)
Thank you!