onnxruntime icon indicating copy to clipboard operation
onnxruntime copied to clipboard

[Performance]

Open BenLag2906 opened this issue 8 months ago • 3 comments

Describe the issue

Hi,

I want to use ONNX Runtime with the following piece of code, but for performance I need to batch my input with several images.

  • I use CUDA on UNIX with C++.

  • My input is a GpuMat of OpenCV.

Is there a simple way to use a dynamic batch in C++?

I converted my model by introducing a symbolic batch dimension N into my ONNX model.

To reproduce

  • \param input A single image. This float array has length of 3*h*w

    • \param output A byte array. should be freed by caller after use */ std::vector<float> Color_Engine::run_inference(cv::cuda::GpuMat input_gpu, int a ) { auto start_color = high_resolution_clock::now(); size_t input_height; size_t input_width; float model_input; size_t model_input_ele_count;

    const int64_t input_shape[] = {1, 64, 64, 3}; const size_t input_shape_len = sizeof(input_shape) / sizeof(input_shape[0]); const size_t model_input_len = model_input_ele_count * sizeof(float); std::vector<cv::cuda::GpuMat> batch= std::vector<cv::cuda::GpuMat>(); batch.push_back(input_gpu) ; cv::cuda::GpuMat input_gpu_chw= HWC_TO_CHW(batch,false);

    auto start_color_0 = high_resolution_clock::now(); Ort::MemoryInfo memory_info_cuda2("Cuda", OrtArenaAllocator, 0, OrtMemTypeDefault );

    std::array<int64_t, 4> shape{1, 64, 64, 3};

    size_t cuda_buffer_size = 3 * 64 * 64 * sizeof(float); float *cuda_resource;

    cudaMalloc((void**)&cuda_resource, cuda_buffer_size); cv::Mat left; cv::Mat dleft;

    cudaMemcpyAsync(cuda_resource, &(input_gpu_chw.data[0]), cuda_buffer_size, cudaMemcpyDeviceToDevice);

    auto start_color_1 = high_resolution_clock::now();

    auto ort_value = (Ort::Value::CreateTensor(memory_info_cuda2, cuda_resource, cuda_buffer_size, input_shape, input_shape_len, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT));

    Ort::Value * ort = & ort_value ;

    const char* input_names[] = {"x"}; const char* output_names[] = {"conv2d_15"}; OrtValue* output_tensor = NULL; auto start_color_2 = high_resolution_clock::now();

    ORT_ABORT_ON_ERROR( g_ort->Run(session, NULL, input_names, (const OrtValue* const*) ort, 1, output_names, 1, (OrtValue **) &output_tensor) //&input_tensor, 1, output_names, 1, (OrtValue **) &output_tensor) ); int ret = 0;

    auto stop_color = high_resolution_clock::now(); auto duration_color = duration_cast(stop_color - start_color); auto duration_color_0 = duration_cast(start_color_0 - start_color); auto duration_color_1 = duration_cast(start_color_1 - start_color_0); auto duration_color_2 = duration_cast(start_color_2 - start_color_1); auto duration_color_3 = duration_cast(stop_color - start_color_2); std::cout << "********* colorprocess: global " << duration_color.count() << " ns --- " << (1000000/duration_color.count()) << " fps" << std::endl; std::cout << "********* colorprocess: conversion " << duration_color_0.count() << " ns --- " << (1000000/duration_color_0.count()) << " fps" << std::endl; std::cout << "********* colorprocess: alloc " << duration_color_1.count() << " ns --- " << (1000000/duration_color_1.count()) << " fps" << std::endl; std::cout << "********* colorprocess: create Tensor " << duration_color_2.count() << " ns --- " << (1000000/duration_color_2.count()) << " fps" << std::endl; std::cout << "********* colorprocess: run " << duration_color_3.count() << " ns --- " << (1000000/duration_color_3.count()) << " fps" << std::endl;

    std::vector<float> embed1= std::vector<float>() ;

    struct OrtTensorTypeAndShapeInfo* shape_info; ORT_ABORT_ON_ERROR(g_ort->GetTensorTypeAndShape(output_tensor, &shape_info));

    size_t dim_count; ORT_ABORT_ON_ERROR(g_ort->GetDimensionsCount(shape_info, &dim_count));

    int64_t dims[4]; ORT_ABORT_ON_ERROR(g_ort->GetDimensions(shape_info, dims, sizeof(dims) / sizeof(dims[0])));

    float* ft ; ORT_ABORT_ON_ERROR(g_ort->GetTensorMutableData(output_tensor, (void**)&ft));

    size_t stride = dims[2]* dims[3]; int n = 0;

    for (int ii = 0; ii < 1616256; ++ii) { float f2 = (float) ft[ii];

    embed1.push_back(f2);

    }

    cudaFree(cuda_resource); return embed1; }

Urgency

No response

Platform

Linux

OS Version

UBUNTU

ONNX Runtime Installation

Built from Source

ONNX Runtime Version or Commit ID

#define ORT_API_VERSION 20

ONNX Runtime API

C++

Architecture

X64

Execution Provider

CUDA

Execution Provider Library Version

CUDA 11.8

Model File

No response

Is this a quantized model?

Yes

BenLag2906 avatar May 16 '25 09:05 BenLag2906

When you export the ONNX model, you can export it with dynamic axes, as in the following example:

import torch

# Example input and model
dummy_input = torch.randn(1, 3, 224, 224)  # Batch size = 1
model = MyModel()

torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},    # Axis 0 (batch size) is dynamic
        "output": {0: "batch_size"},   # Match the output to also have a dynamic batch
    }
)

tianleiwu avatar May 17 '25 19:05 tianleiwu

Hi,

Thanks for your answer.

The issue is that my model is provided by Keras and not TensorFlow, so I cannot use that conversion.

What I used to convert the model to a dynamic batch is the following piece of Python code.

It runs in the Python environment, where I can process the output of a batch, but that is not the case in C++, where I get a segmentation fault.

Could you please provide a suggestion?

Best regards.

#***************************************************************

Environment: myconvert

#*************************************************************** from tensorflow.keras.models import model_from_json from tensorflow.keras.models import Model from sklearn.cluster import KMeans from collections import Counter, defaultdict import cv2 import onnx import tensorflow as tf import tf2onnx import onnx import onnxruntime as rt import numpy as np import pandas

json_file = open('convautoencodermodel_10.json', 'r') loaded_model_json = json_file.read() json_file.close() loaded_model = model_from_json(loaded_model_json) loaded_model.load_weights("convautoencodermodel_10.h5") jearsey_layer_model = Model(inputs=loaded_model.input,outputs=loaded_model.get_layer("conv2d_15").output)

#****************************************************************

Binding of inputs: the name is essential for binding in C++

#**************************************************************** input_signature = [tf.TensorSpec([1, 64, 64, 3], tf.float32, name='x')] onnx_model, _ = tf2onnx.convert.from_keras(jearsey_layer_model, input_signature, opset=13) output_names = [n.name for n in onnx_model.graph.output] onnx.save(onnx_model, "./model.onnx")

def change_input_dim(model): # Use some symbolic name not used for any other dimension sym_batch_dim = "N" # or an actual value actual_batch_dim = 4 # The following code changes the first dimension of every input to be batch-dim # Modify as appropriate ... note that this requires all inputs to # have the same batch_dim inputs = model.graph.input for input in inputs: # Checks omitted. This assumes that all inputs are tensors and have a shape with first dim. # Add checks as needed. dim1 = input.type.tensor_type.shape.dim[0] # update dim to be a symbolic value dim1.dim_param = sym_batch_dim # or update it to be an actual value: #dim1.dim_value = actual_batch_dim

def apply(transform, infile, outfile): model = onnx.load(infile) transform(model) onnx.save(model, outfile)

#apply(change_input_dim, "./model.onnx", "./model4N.onnx")

patches = [] patche = cv2.imread('visa.png') patche1 = cv2.imread('visa2.png') patches.append(patche) patches.append(patche1) patches.append(patche) patches.append(patche1)

providers = ['CPUExecutionProvider'] m = rt.InferenceSession("./modelN.onnx", providers=providers) onnx_pred = m.run(output_names, {"x": patches})

BenLag2906 avatar May 19 '25 11:05 BenLag2906

This issue has been automatically marked as stale due to inactivity and will be closed in 30 days if no further activity occurs. If further support is needed, please provide an update and/or more details.

github-actions[bot] avatar Jun 18 '25 15:06 github-actions[bot]

Applying stale label due to no activity in 30 days

Closing issue due to no activity in 30 days