
ONNXRuntimeError: Non-zero status code returned while running Gather node

Open RRRROBOT opened this issue 2 years ago • 12 comments

问题确认 Search before asking

  • [X] 我已经查询历史issue,没有发现相似的bug。I have searched the issues and found no similar bug report.

Bug组件 Bug Component

Inference, Deploy

Bug描述 Describe the Bug

After converting a PicoDet_m416 model to ONNX, the following error is raised when running video inference.

  • The number of training classes is 1.
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Gather node. Name:'p2o.Gather.8' Status Message: /onnxruntime_src/onnxruntime/core/providers/common.h:24 int64_t onnxruntime::HandleNegativeAxis(int64_t, int64_t) axis >= -tensor_rank && axis <= tensor_rank - 1 was false. axis 0 is not in valid range [-0,-1]
  • Test command: python3 deploy/third_engine/onnx/infer.py --infer_cfg self_project/convert_model/Person_picodet_m416_data2600/infer_cfg.yml --onnx_file self_project/onnx/Person_picom416_data2600_best.onnx --video_file demo_mp4/person_test_video.mp4 --output_dir other_img_res (executed from the PaddleDetection root directory)
  • Model file: ONNX
  • Config file: YAML. The modified files are listed below; a sketch for checking the exported model's declared inputs follows the code listings.
  • Contents of PaddleDetection/deploy/third_engine/onnx/infer.py:
import os
import cv2
import yaml
import datetime
import argparse
import numpy as np
import glob
from onnxruntime import InferenceSession

from preprocess import Compose

# Global dictionary
SUPPORT_MODELS = {
    'YOLO', 'RCNN', 'SSD', 'Face', 'FCOS', 'SOLOv2', 'TTFNet', 'S2ANet', 'JDE',
    'FairMOT', 'DeepSORT', 'GFL', 'PicoDet', 'CenterNet', 'TOOD', 'RetinaNet',
    'StrongBaseline', 'STGCN', 'YOLOX', 'HRNet'
}

current_path = os.path.dirname(os.path.abspath(__file__))
print(type(current_path), current_path)
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml")
parser.add_argument(
    '--onnx_file', type=str, default="model.onnx", help="onnx model file path")
parser.add_argument("--image_dir", type=str)
parser.add_argument("--image_file", type=str)
parser.add_argument("--video_file", type=str, default=None)
parser.add_argument("--output_dir", type=str, default=current_path + "/output", help="final output dir")


def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--image_file or --image_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    print("Found {} inference images in total.".format(len(images)))

    return images


class PredictConfig(object):
    """set config of preprocess, postprocess and visualize
    Args:
        infer_config (str): path of infer_cfg.yml
    """

    def __init__(self, infer_config):
        # parsing Yaml config for Preprocess
        with open(infer_config) as f:
            yml_conf = yaml.safe_load(f)
        self.check_model(yml_conf)  # check that the model architecture is supported
        self.arch = yml_conf['arch']
        self.preprocess_infos = yml_conf['Preprocess']
        self.min_subgraph_size = yml_conf['min_subgraph_size']
        self.label_list = yml_conf['label_list']
        self.use_dynamic_shape = yml_conf['use_dynamic_shape']
        self.draw_threshold = yml_conf.get("draw_threshold", 0.5)
        self.mask = yml_conf.get("mask", False)
        self.tracker = yml_conf.get("tracker", None)
        self.nms = yml_conf.get("NMS", None)
        self.fpn_stride = yml_conf.get("fpn_stride", None)
        if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
            print(
                'The RCNN export model is used for ONNX and it only supports batch_size = 1'
            )
        self.print_config()

    def check_model(self, yml_conf):
        """
        Raises:
            ValueError: loaded model not in supported model type
        """
        for support_model in SUPPORT_MODELS:
            if support_model in yml_conf['arch']:
                return True
        raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
                                                                      'arch'], SUPPORT_MODELS))

    def print_config(self):
        print('-----------  Model Configuration -----------')
        print('%s: %s' % ('Model Arch', self.arch))
        print('%s: ' % ('Transform Order'))
        for op_info in self.preprocess_infos:
            print('--%s: %s' % ('transform op', op_info['type']))
        print('--------------------------------------------')


def predict_image(infer_config, predictor, img_list, out_dir):
    # load preprocess transforms
    transforms = Compose(infer_config.preprocess_infos)  # build the preprocessing pipeline for this model
    # predict image
    for img_path in img_list:
        inputs = transforms(img_path)  # preprocess the input image
        img = inputs['image']   # preprocessed image tensor
        # print('%s: %s' % ('inputs', inputs))
        
        bgr_img = cv2.imread(img_path)
        img_name = img_path.split('/')[-1]
        
        inputs_name = [var.name for var in predictor.get_inputs()]  # names of the model's input tensors
        # print('%s: %s' % ('inputs_name', inputs_name))
        inputs = {k: inputs[k][None,] for k in inputs_name}     # add a batch dimension and build the input feed
        # print('%s: %s' % ('inputs', inputs))

        outputs = predictor.run(output_names=None, input_feed=inputs)

        print("ONNXRuntime predict: ")
        if infer_config.arch in ["HRNet"]:
            print(np.array(outputs[0]))
        else:
            bboxes = np.array(outputs[0])
            for bbox in bboxes:
                if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:
                    print(f"{int(bbox[0])} {bbox[1]} "
                          f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")
                    cv2.rectangle(bgr_img, (int(bbox[2]), int(bbox[3])), (int(bbox[4]), int(bbox[5])), (255, 0, 0), 2)
            save_path = os.path.join(out_dir, img_name)
            cv2.imwrite(save_path, bgr_img)
            print(f'{img_name} saved in {out_dir}')


def predict_video(infer_config, predictor, video_file, out_dir):
    capture = cv2.VideoCapture(video_file)
    fps = int(capture.get(cv2.CAP_PROP_FPS))
    frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    print("video fps: %d, frame_count: %d" % (fps, frame_count))

    transforms = Compose(infer_config.preprocess_infos)
    
    index = 0
    while 1:
        index += 1
        ret, frame = capture.read()
        if not ret:
            break
        if index % fps == 0:  # sample one frame every `fps` frames, i.e. roughly one frame per second
            img_name = 'frame_' + str(index) + '.png'
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = transforms.other_call(frame_rgb)
            inputs_name = [var.name for var in predictor.get_inputs()]
            inputs = {k: inputs[k][None,] for k in inputs_name}
            
            outputs = predictor.run(output_names=None, input_feed=inputs)
            print("ONNXRuntime predict: ")
            
            if infer_config.arch in ["HRNet"]:
                print(np.array(outputs[0]))
            else:
                bboxes = np.array(outputs[0])
                for bbox in bboxes:
                    if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:
                        print(f"{int(bbox[0])} {bbox[1]} "
                            f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")
                        cv2.rectangle(frame, (int(bbox[2]), int(bbox[3])), (int(bbox[4]), int(bbox[5])), (255, 0, 0), 2)
                save_path = os.path.join(out_dir, img_name)
                cv2.imwrite(save_path, frame)
                print(f'{img_name} saved in {out_dir}')


if __name__ == '__main__':
    
    FLAGS = parser.parse_args()
    # create dir
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    
    # load predictor
    predictor = InferenceSession(FLAGS.onnx_file)
    # load infer config
    infer_config = PredictConfig(FLAGS.infer_cfg)
    if FLAGS.video_file is None:
        # load image list
        img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)

        start_time = datetime.datetime.now()
        predict_image(infer_config, predictor, img_list, FLAGS.output_dir)
        end_time = datetime.datetime.now()
        print("Total Time: %ss" % (end_time - start_time).total_seconds())  # 总共0.34ms
    else:
        print('Video Detect!')
        predict_video(infer_config, predictor, FLAGS.video_file, FLAGS.output_dir)
  • The following method was added to class Compose at the end of PaddleDetection/deploy/third_engine/onnx/preprocess.py:
    def other_call(self, img):
        # same preprocessing as __call__, but takes an already-decoded frame (ndarray) instead of an image path;
        # requires `import copy` and `import numpy as np` at the top of preprocess.py if not already present
        im_info = {
            "im_shape": np.array(
                img.shape[:2], dtype=np.float32),
            "scale_factor": np.array(
                [1., 1.], dtype=np.float32)
        }
        for t in self.transforms:
            img, im_info = t(img, im_info)
        inputs = copy.deepcopy(im_info)
        inputs['image'] = img
        return inputs
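
For anyone debugging the same failure: a minimal sketch for checking what the exported graph actually declares is below. The input names 'image' and 'scale_factor' and their shapes are assumptions about a typical PicoDet export that keeps post-processing in the graph; verify them against your own model.

# Minimal sketch: print the ONNX graph's declared inputs/outputs before running it.
# The model path is taken from the report above and is a placeholder.
from onnxruntime import InferenceSession

sess = InferenceSession("self_project/onnx/Person_picom416_data2600_best.onnx")

for var in sess.get_inputs():
    print("input :", var.name, var.shape, var.type)
for var in sess.get_outputs():
    print("output:", var.name, var.shape, var.type)

# A PicoDet export that keeps NMS in the graph usually declares something like
#   input : image         [N, 3, H, W]  tensor(float)
#   input : scale_factor  [N, 2]        tensor(float)
# If the ranks printed here differ from what infer.py feeds (for example a
# scale_factor of shape (2,) instead of (1, 2)), Gather nodes inside the
# post-processing subgraph may fail with the same "axis 0 is not in valid
# range [-0,-1]" message.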

复现环境 Environment

  • OS: Linux
  • PaddlePaddle: paddlepaddle-gpu 2.3.2
  • PaddleDetection: release/2.6
  • Python: 3.7
  • CUDA: 11.0
  • CUDNN: 7.6

Bug描述确认 Bug description confirmation

  • [X] 我确认已经提供了Bug复现步骤、代码改动说明、以及环境信息,确认问题是可以复现的。I confirm that the bug replication steps, code change instructions, and environment information have been provided, and the problem can be reproduced.

是否愿意提交PR? Are you willing to submit a PR?

  • [ ] 我愿意提交PR!I'd like to help by submitting a PR!

RRRROBOT · Apr 28 '23, 08:04

Still present in 2.4.2:

RuntimeException                          Traceback (most recent call last)
Cell In[9], line 76
     73 plt.imshow(img[:,:,::-1])
     74 plt.show()
---> 76 infer_sim_picodet(img_file, picdet_onnx_model)

Cell In[9], line 54, in infer_sim_picodet(img_file, picdet_onnx_model)
     52 print('preprocess', pre_end-pre_start, 's')
     53 start = time.time()
---> 54 raw_result = session.run([], {input_name: input_data,
     55                               'scale_factor': image_size})
     56 end = time.time()
     57 print('inference time:', end-start, 's')

File ~/anaconda3/envs/PaddlePaddle/lib/python3.9/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:217, in Session.run(self, output_names, input_feed, run_options)
    215     output_names = [output.name for output in self._outputs_meta]
    216 try:
--> 217     return self._sess.run(output_names, input_feed, run_options)
    218 except C.EPFail as err:
    219     if self._enable_fallback:

RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Gather node. Name:'p2o.Gather.8' Status Message: /onnxruntime_src/onnxruntime/core/providers/common.h:23 int64_t onnxruntime::HandleNegativeAxis(int64_t, int64_t) axis >= -tensor_rank && axis <= tensor_rank - 1 was false. axis 0 is not in valid range [-0,-1]
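
For comparison, a hedged sketch of the same run call with both inputs given an explicit batch dimension. The input names and shapes are assumptions about a PicoDet export that keeps NMS, not taken from the notebook above, so check them with sess.get_inputs() first.

import numpy as np
from onnxruntime import InferenceSession

sess = InferenceSession("picodet.onnx")                 # placeholder path
img = np.zeros((1, 3, 416, 416), dtype=np.float32)      # preprocessed image, NCHW
scale = np.array([[1.0, 1.0]], dtype=np.float32)        # shape (1, 2), not (2,)

# None means "return all outputs"; feed keys must match the declared input names
outputs = sess.run(None, {"image": img, "scale_factor": scale})
print([o.shape for o in outputs])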

jinyaxuan · May 25 '23, 17:05

I'm running into the same problem. Is there a solution?

shengjie1980 · Sep 06 '23, 09:09

I also hit this error when running inference with a model trained from picodet_lcnet_x1_0_layout.yml and converted to ONNX.

onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Gather node. Name:'p2o.Gather.8' Status Message: /onnxruntime_src/onnxruntime/core/providers/common.h:31 int64_t onnxruntime::HandleNegativeAxis(int64_t, int64_t) IsAxisInRange(axis, tensor_rank) was false. axis 0 is not in valid range [-0,-1]

elonzh · Nov 23 '23, 10:11

I hit the same problem and could not find a fix. In the end I pruned the exported graph and rewrote the post-processing myself to get inference working. I hope the maintainers can fix this bug.
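
A hedged sketch of the graph-pruning half of this workaround, using onnx.utils.extract_model. The tensor names below are placeholders (look up the real raw-head tensor names of your model in a viewer such as Netron), and the box decoding plus NMS still have to be reimplemented in Python on the pruned model's outputs, as described above.

# Cut the exported graph before the failing post-processing subgraph.
# Paths and tensor names are placeholders for illustration only.
from onnx.utils import extract_model

extract_model(
    "picodet_full.onnx",                   # original export
    "picodet_no_postprocess.onnx",         # pruned model without NMS/decoding
    input_names=["image"],
    output_names=["cls_score_0", "bbox_pred_0"],   # placeholder raw-head tensors
)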

gk966988 · Mar 19 '24, 08:03

I'm running YOLOX, and after many experiments I've narrowed down the most likely trigger. Taking YOLOX as the example: cspdarknet.yml has a score_threshold parameter. If you change it from the default 0.001 to 0.25, train, export the model to the inference format and then to ONNX, running inference on some images tends to raise exactly this ONNX error ("Non-zero status code returned while running Gather node"). If you have modified this kind of YAML setting, try changing it back; the defaults generally do not cause problems. Of course, the reason I changed it in the first place is that I was getting two overlapping bboxes on a single object and wanted to tweak the NMS settings to see whether that helped. Has anyone looked into why a single object can end up with two overlapping bboxes in the results?

lingdu-xu · Apr 29 '24, 05:04

Are there any updates? I have the same problem: this Gather-node error is raised right after starting an onnxruntime session with the PicoDet model.

I am using onnxruntime-gpu version 1.18

susanin1970 · Sep 18 '24, 15:09