[Performance]
Describe the issue
Hi,
I want to use ONNX Runtime with the following piece of code, but for performance I need to batch my input with several images.

- I use CUDA on UNIX with C++.
- My input is an OpenCV GpuMat.

Is there a simple way to use a dynamic batch in C++?
I converted my model by introducing an "N" (symbolic batch) dimension in the ONNX model.
To reproduce
```cpp
/**
 * \param input A single image. This float array has length 3*h*w.
 * \param output A byte array; should be freed by the caller after use.
 */
std::vector<float>
Color_Engine::run_inference(cv::cuda::GpuMat input_gpu, int a) {
  auto start_color = high_resolution_clock::now();

  // Fixed single-image input shape: NHWC = {1, 64, 64, 3}.
  const int64_t input_shape[] = {1, 64, 64, 3};
  const size_t input_shape_len = sizeof(input_shape) / sizeof(input_shape[0]);

  // Wrap the single image in a one-element batch and convert HWC -> CHW.
  std::vector<cv::cuda::GpuMat> batch;
  batch.push_back(input_gpu);
  cv::cuda::GpuMat input_gpu_chw = HWC_TO_CHW(batch, false);

  auto start_color_0 = high_resolution_clock::now();
  Ort::MemoryInfo memory_info_cuda2("Cuda", OrtArenaAllocator, 0, OrtMemTypeDefault);

  // Device buffer sized for exactly one 64x64x3 float image.
  size_t cuda_buffer_size = 3 * 64 * 64 * sizeof(float);
  float* cuda_resource;
  cudaMalloc((void**)&cuda_resource, cuda_buffer_size);
  cudaMemcpyAsync(cuda_resource, &(input_gpu_chw.data[0]), cuda_buffer_size,
                  cudaMemcpyDeviceToDevice);

  auto start_color_1 = high_resolution_clock::now();
  auto ort_value = Ort::Value::CreateTensor(
      memory_info_cuda2, cuda_resource, cuda_buffer_size,
      input_shape, input_shape_len, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
  Ort::Value* ort = &ort_value;

  const char* input_names[] = {"x"};
  const char* output_names[] = {"conv2d_15"};
  OrtValue* output_tensor = NULL;

  auto start_color_2 = high_resolution_clock::now();
  ORT_ABORT_ON_ERROR(g_ort->Run(session, NULL, input_names,
                                (const OrtValue* const*)ort, 1,
                                output_names, 1, (OrtValue**)&output_tensor));

  auto stop_color = high_resolution_clock::now();
  auto duration_color   = duration_cast<microseconds>(stop_color - start_color);
  auto duration_color_0 = duration_cast<microseconds>(start_color_0 - start_color);
  auto duration_color_1 = duration_cast<microseconds>(start_color_1 - start_color_0);
  auto duration_color_2 = duration_cast<microseconds>(start_color_2 - start_color_1);
  auto duration_color_3 = duration_cast<microseconds>(stop_color - start_color_2);
  std::cout << "********* colorprocess: global        " << duration_color.count()   << " us --- " << (1000000 / duration_color.count())   << " fps" << std::endl;
  std::cout << "********* colorprocess: conversion    " << duration_color_0.count() << " us --- " << (1000000 / duration_color_0.count()) << " fps" << std::endl;
  std::cout << "********* colorprocess: alloc         " << duration_color_1.count() << " us --- " << (1000000 / duration_color_1.count()) << " fps" << std::endl;
  std::cout << "********* colorprocess: create Tensor " << duration_color_2.count() << " us --- " << (1000000 / duration_color_2.count()) << " fps" << std::endl;
  std::cout << "********* colorprocess: run           " << duration_color_3.count() << " us --- " << (1000000 / duration_color_3.count()) << " fps" << std::endl;

  // Copy the output tensor back element by element.
  std::vector<float> embed1;
  struct OrtTensorTypeAndShapeInfo* shape_info;
  ORT_ABORT_ON_ERROR(g_ort->GetTensorTypeAndShape(output_tensor, &shape_info));
  size_t dim_count;
  ORT_ABORT_ON_ERROR(g_ort->GetDimensionsCount(shape_info, &dim_count));
  int64_t dims[4];
  ORT_ABORT_ON_ERROR(g_ort->GetDimensions(shape_info, dims, sizeof(dims) / sizeof(dims[0])));
  float* ft;
  ORT_ABORT_ON_ERROR(g_ort->GetTensorMutableData(output_tensor, (void**)&ft));
  // Hard-coded element count of the "conv2d_15" output.
  for (int ii = 0; ii < 1616256; ++ii) {
    embed1.push_back(ft[ii]);
  }

  cudaFree(cuda_resource);
  return embed1;
}
```
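For reference, here is a minimal sketch (not from the report) of how the same tensor creation could look for a batch of N images once the model's batch axis is dynamic. It continues from the snippet above (`memory_info_cuda2` is reused); `batch` and `batched_resource` are illustrative names. The key constraint is that the byte count passed to `CreateTensor` must equal the product of the shape dimensions times `sizeof(float)`, otherwise `Run` can read past the buffer:

```cpp
// Illustrative fragment: build one contiguous device buffer for N images
// and describe it with a leading batch dimension of N.
const int64_t batch_n = 4;  // hypothetical batch size
const int64_t batched_shape[] = {batch_n, 64, 64, 3};
const size_t batched_shape_len = sizeof(batched_shape) / sizeof(batched_shape[0]);

// Buffer size must match the shape: batch_n * 64 * 64 * 3 floats.
size_t batched_buffer_size = batch_n * 64 * 64 * 3 * sizeof(float);
float* batched_resource = nullptr;
cudaMalloc((void**)&batched_resource, batched_buffer_size);
// ... copy all N CHW images contiguously into batched_resource ...

auto batched_value = Ort::Value::CreateTensor(
    memory_info_cuda2, batched_resource, batched_buffer_size,
    batched_shape, batched_shape_len, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
```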
Urgency
No response
Platform
Linux
OS Version
Ubuntu
ONNX Runtime Installation
Built from Source
ONNX Runtime Version or Commit ID
#define ORT_API_VERSION 20
ONNX Runtime API
C++
Architecture
X64
Execution Provider
CUDA
Execution Provider Library Version
CUDA 11.8
Model File
No response
Is this a quantized model?
Yes
When you export an ONNX model, you can export it with dynamic axes like the following example:
```python
import torch

# Example input and model
dummy_input = torch.randn(1, 3, 224, 224)  # Batch size = 1
model = MyModel()

torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},   # Axis 0 (batch size) is dynamic
        "output": {0: "batch_size"},  # Match the output to also have a dynamic batch
    },
)
```
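Once exported, you can confirm the batch axis is dynamic from C++ before allocating any buffers: a dynamic dimension is reported as -1. A minimal sketch, assuming an already-created `Ort::Session` named `session`:

```cpp
#include <onnxruntime_cxx_api.h>
#include <iostream>
#include <vector>

// Print the model's first input shape; a dynamic batch axis shows up as -1.
void print_input_shape(Ort::Session& session) {
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> dims = tensor_info.GetShape();
    for (int64_t d : dims)
        std::cout << d << " ";  // e.g. "-1 3 224 224" for a dynamic batch
    std::cout << std::endl;
}
```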
Hi,
Thanks for your answer.
The issue is that my model comes from Keras, so I cannot use that (PyTorch) conversion.
What I used to convert the model to a dynamic batch is the following piece of Python code.
It runs in a Python environment, where I can process the output of a batch, but in C++ it fails with a segmentation fault.
Could you please provide a suggestion?
Best regards.
```python
#***************************************************************
# Environment: myconvert
#***************************************************************
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import Model
from sklearn.cluster import KMeans
from collections import Counter, defaultdict
import cv2
import onnx
import tensorflow as tf
import tf2onnx
import onnxruntime as rt
import numpy as np
import pandas

json_file = open('convautoencodermodel_10.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("convautoencodermodel_10.h5")
jearsey_layer_model = Model(inputs=loaded_model.input,
                            outputs=loaded_model.get_layer("conv2d_15").output)

#****************************************************************
# Binding of inputs: the name is essential for binding in C++
#****************************************************************
input_signature = [tf.TensorSpec([1, 64, 64, 3], tf.float32, name='x')]
onnx_model, _ = tf2onnx.convert.from_keras(jearsey_layer_model, input_signature, opset=13)
output_names = [n.name for n in onnx_model.graph.output]
onnx.save(onnx_model, "./model.onnx")

def change_input_dim(model):
    # Use some symbolic name not used for any other dimension
    sym_batch_dim = "N"
    # or an actual value
    actual_batch_dim = 4
    # The following code changes the first dimension of every input to be
    # batch-dim. Modify as appropriate ... note that this requires all
    # inputs to have the same batch_dim.
    inputs = model.graph.input
    for input in inputs:
        # Checks omitted. This assumes that all inputs are tensors and have
        # a shape with a first dim. Add checks as needed.
        dim1 = input.type.tensor_type.shape.dim[0]
        # update dim to be a symbolic value
        dim1.dim_param = sym_batch_dim
        # or update it to be an actual value:
        # dim1.dim_value = actual_batch_dim

def apply(transform, infile, outfile):
    model = onnx.load(infile)
    transform(model)
    onnx.save(model, outfile)

#apply(change_input_dim, "./model.onnx", "./model4N.onnx")

patches = []
patche = cv2.imread('visa.png')
patche1 = cv2.imread('visa2.png')
patches.append(patche)
patches.append(patche1)
patches.append(patche)
patches.append(patche1)

providers = ['CPUExecutionProvider']
m = rt.InferenceSession("./modelN.onnx", providers=providers)
onnx_pred = m.run(output_names, {"x": patches})
```
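For comparison, here is a minimal C++ sketch of the same batch-of-4 run, using CPU memory to keep it self-contained. The input/output names ("x", "conv2d_15") follow the converted model above; the pixel data is a placeholder. The essential point is that the buffer element count matches the shape product (N * 64 * 64 * 3):

```cpp
#include <onnxruntime_cxx_api.h>
#include <array>
#include <vector>

int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "batch_demo");
    Ort::SessionOptions opts;
    Ort::Session session(env, "./modelN.onnx", opts);

    constexpr int64_t N = 4;
    std::array<int64_t, 4> shape{N, 64, 64, 3};
    // One contiguous buffer holding all four NHWC float images.
    std::vector<float> input(N * 64 * 64 * 3, 0.0f);  // fill with real pixels

    Ort::MemoryInfo mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        mem, input.data(), input.size(), shape.data(), shape.size());

    const char* input_names[] = {"x"};
    const char* output_names[] = {"conv2d_15"};
    auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names,
                               &input_tensor, 1, output_names, 1);
    // outputs[0] now carries a leading batch dimension of N as well.
    return 0;
}
```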
This issue has been automatically marked as stale due to inactivity and will be closed in 30 days if no further activity occurs. If further support is needed, please provide an update and/or more details.
Applying stale label due to no activity in 30 days
Closing issue due to no activity in 30 days