Retrieval-based-Voice-Conversion-WebUI onnx问题请教

问题描述：我参考infer/modules/onnx/export.py尝试将net_g的torch模型转onnx，遇到一些问题，望大佬帮忙看看：

onnx推理时不支持动态长度，export.py中默认hubert长度为200，转成onnx后也只能推理hubert长度为200的输入，其他长度的会报错；虽然问题通过简单的方式解决了，但是不知道会不会影响结果，我的解决方式是：根据转onnx的warning内容，把infer/lib/infer_pack/attentions.py中用到的int删去。
即使在不修改代码，并且保持onnx推理时hubert长度为200的情况下，对比torch模型的输出和onnx模型的输出，发现输出结果差别挺大的，不知道是为什么呢？

如下是运行代码：

import torch
import sys
sys.path.append("/czc/Retrieval-based-Voice-Conversion-WebUI-main")
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
import time
def export_onnx(ModelPath, ExportedPath):
    cpt = torch.load(ModelPath, map_location="cpu")
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768

    test_phone = torch.rand(1, 200, vec_channels)  # hidden unit
    test_phone_lengths = torch.tensor([200]).long()  # hidden unit 长度（貌似没啥用）
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # 基频（单位赫兹）
    test_pitchf = torch.rand(1, 200)  # nsf基频
    test_ds = torch.LongTensor([0])  # 说话人ID
    test_rnd = torch.rand(1, 192, 200)  # 噪声（加入随机因子）
    device = "cpu"  # 导出时设备（不影响使用模型）
    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False, version=cpt.get("version", "v1")
    )  # fp32导出（C++要支持fp16必须手动将内存重新排列所以暂时不用fp16）
    net_g.load_state_dict(cpt["weight"], strict=False)
    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = [
        "audio",
    ]
    # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出
    torch.onnx.export(
        net_g,
        (
            test_phone.to(device),
            test_phone_lengths.to(device),
            test_pitch.to(device),
            test_pitchf.to(device),
            test_ds.to(device),
            test_rnd.to(device),
        ),
        ExportedPath,
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=13,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
    return "Finished"
if __name__ == "__main__":
    # 有bug导致动态维度失效
    # infer/lib/infer_pack/attentions.py 将export中报错的地方的int去掉
    ModelPath = "/czc/Retrieval-based-Voice-Conversion-WebUI-main/assets/weights/nvzhubo.pth"
    model_path = "/czc/Retrieval-based-Voice-Conversion-WebUI-main/assets/weights/nvzhubo.onnx"
    # 转onnx
    export_onnx(ModelPath, model_path)
    cpt = torch.load(ModelPath, map_location="cpu")
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
    test_phone = torch.rand(1, 200, vec_channels)  # hidden unit
    test_phone_lengths = torch.tensor([200]).long()  # hidden unit 长度（貌似没啥用）
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # 基频（单位赫兹）
    test_pitchf = torch.rand(1, 200)  # nsf基频
    test_ds = torch.LongTensor([0])  # 说话人ID
    test_rnd = torch.zeros(1, 192, 200)  # 噪声（加入随机因子）
    time1 = time.time()
    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False, version=cpt.get("version", "v1")
    )  # fp32导出（C++要支持fp16必须手动将内存重新排列所以暂时不用fp16）
    net_g.eval()
    out1 = net_g(test_phone, test_phone_lengths, test_pitch, test_pitchf, test_ds, test_rnd)
    print(time.time()-time1)
    print(out1.shape)
    print(out1[0,0,:5])
    providers = ["CPUExecutionProvider"]
    # 保持输入一致
    test_phone = test_phone.numpy()  # hidden unit
    test_phone_lengths = test_phone_lengths.numpy()  # hidden unit 长度（貌似没啥用）
    test_pitch = test_pitch.numpy()  # 基频（单位赫兹）
    test_pitchf = test_pitchf.numpy()  # nsf基频
    test_ds = test_ds.numpy()  # 说话人ID
    test_rnd = test_rnd.numpy()  # 噪声（加入随机因子）
    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    onnx_input = {
        "phone": test_phone,
        "phone_lengths": test_phone_lengths,
        "pitch": test_pitch,
        "pitchf": test_pitchf,
        "ds": test_ds,
        "rnd": test_rnd,
    }
    import onnxruntime
    import numpy as np
    onnx = onnxruntime.InferenceSession(model_path, providers=providers)
    time2 = time.time()
    out2 = torch.tensor(onnx.run(None, onnx_input)[0])
    # out2 = torch.tensor((onnx.run(None, onnx_input)[0] * 32767).astype(np.int16))
    print(time.time()-time2)
    print(out2.shape)
    print(out2[0,0,:5])

如下是两模型的输出结果：

torch.Size([1, 1, 80000])
tensor([0.0870, 0.0332, 0.1191, 0.2330, 0.2877], grad_fn=<SliceBackward0>)
torch.Size([1, 1, 80000])
tensor([0.0115, 0.0109, 0.0093, 0.0084, 0.0084])

如下是转onnx的输出日志：

/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input phone
  warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input pitch
  warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input pitchf
  warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input rnd
  warnings.warn(
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:249: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  t_s == t_t
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:311: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  pad_length: int = max(length - (self.window_size + 1), 0)
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:312: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  slice_start_position = max((self.window_size + 1) - length, 0)
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if pad_length > 0:
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:366: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/_internal/jit_utils.py:306: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:689: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
  _C._jit_pass_onnx_graph_shape_type_inference(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:1186: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
  _C._jit_pass_onnx_graph_shape_type_inference(
============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============
verbose: False, log level: Level.ERROR
======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
****