Retrieval-based-Voice-Conversion-WebUI
Retrieval-based-Voice-Conversion-WebUI copied to clipboard
onnx问题请教
问题描述:我参考infer/modules/onnx/export.py尝试将net_g的torch模型转onnx,遇到一些问题,望大佬帮忙看看:
- onnx推理时不支持动态长度,export.py中默认hubert长度为200,转成onnx后也只能推理hubert长度为200的输入,其他长度的会报错;虽然问题通过简单的方式解决了,但是不知道会不会影响结果,我的解决方式是:根据转onnx的warning内容,把infer/lib/infer_pack/attentions.py中用到的int删去。
- 即使在不修改代码,并且保持onnx推理时hubert长度为200的情况下,对比torch模型的输出和onnx模型的输出,发现输出结果差别挺大的,不知道是为什么呢?
如下是运行代码:
import torch
import sys
sys.path.append("/czc/Retrieval-based-Voice-Conversion-WebUI-main")
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
import time
def export_onnx(ModelPath, ExportedPath):
    """Export the ``net_g`` synthesizer stored in a checkpoint to an ONNX file.

    The checkpoint is read for its ``"config"`` and ``"weight"`` entries and
    its optional ``"version"`` entry (defaulting to ``"v1"``). The model is
    traced with dummy inputs and the frame axes are declared dynamic under
    explicit names, so the exporter does not have to auto-generate them.

    Declaring an axis dynamic does not by itself make the traced graph valid
    for other lengths: any ``TracerWarning`` printed during export marks a
    value that was frozen as a constant at trace time.

    Args:
        ModelPath: Path of the ``.pth`` checkpoint to load.
        ExportedPath: Path the ``.onnx`` file is written to.

    Returns:
        The string ``"Finished"`` after ``torch.onnx.export`` has returned.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # that come from a trusted source.
    cpt = torch.load(ModelPath, map_location="cpu")
    # Replace the third-from-last config entry with the row count of the
    # speaker embedding weight stored in the checkpoint.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    version = cpt.get("version", "v1")
    vec_channels = 256 if version == "v1" else 768

    # Number of frames used for the dummy tracing inputs.
    n_frames = 200
    test_phone = torch.rand(1, n_frames, vec_channels)  # hidden units
    # Length of the hidden units (seems to have little effect).
    test_phone_lengths = torch.tensor([n_frames]).long()
    test_pitch = torch.randint(size=(1, n_frames), low=5, high=255)  # pitch (Hz)
    test_pitchf = torch.rand(1, n_frames)  # NSF pitch
    test_ds = torch.LongTensor([0])  # speaker ID
    test_rnd = torch.rand(1, 192, n_frames)  # noise (adds a random factor)

    device = "cpu"  # device used while exporting (does not affect later use)

    # Export in fp32: supporting fp16 from C++ requires re-arranging memory
    # by hand, so fp16 is not used for now.
    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False, version=version
    )
    net_g.load_state_dict(cpt["weight"], strict=False)
    # Put the model in inference mode explicitly before tracing it.
    net_g.eval()

    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = [
        "audio",
    ]
    # Multi-speaker mixed-track export would call
    # net_g.construct_spkmixmap(n_speaker) here.
    torch.onnx.export(
        net_g,
        (
            test_phone.to(device),
            test_phone_lengths.to(device),
            test_pitch.to(device),
            test_pitchf.to(device),
            test_ds.to(device),
            test_rnd.to(device),
        ),
        ExportedPath,
        # The dict form names each dynamic axis; the bare-list form makes the
        # exporter warn and invent names on its own.
        dynamic_axes={
            "phone": {1: "n_frames"},
            "pitch": {1: "n_frames"},
            "pitchf": {1: "n_frames"},
            "rnd": {2: "n_frames"},
            "audio": {2: "n_samples"},
        },
        do_constant_folding=False,
        opset_version=13,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
    return "Finished"
if __name__ == "__main__":
    # Compare the PyTorch model with its ONNX export on identical inputs.
    #
    # Known issue: a bug makes the dynamic dimensions ineffective.
    # Workaround: in infer/lib/infer_pack/attentions.py, remove the `int` at
    # the places reported during export.
    ModelPath = "/czc/Retrieval-based-Voice-Conversion-WebUI-main/assets/weights/nvzhubo.pth"
    model_path = "/czc/Retrieval-based-Voice-Conversion-WebUI-main/assets/weights/nvzhubo.onnx"

    # Convert to ONNX.
    export_onnx(ModelPath, model_path)

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # that come from a trusted source.
    cpt = torch.load(ModelPath, map_location="cpu")
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    version = cpt.get("version", "v1")
    vec_channels = 256 if version == "v1" else 768

    test_phone = torch.rand(1, 200, vec_channels)  # hidden units
    # Length of the hidden units (seems to have little effect).
    test_phone_lengths = torch.tensor([200]).long()
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # pitch (Hz)
    test_pitchf = torch.rand(1, 200)  # NSF pitch
    test_ds = torch.LongTensor([0])  # speaker ID
    test_rnd = torch.zeros(1, 192, 200)  # noise input, zeroed for the comparison

    # fp32 model, matching the exported one.
    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False, version=version
    )
    # Load the trained weights. Without this the reference model keeps its
    # random initialisation, while the ONNX file was exported from the loaded
    # weights, so the two outputs could never agree.
    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g.eval()

    # Time only the forward pass so the figure is comparable with the ONNX
    # timing below, which also excludes model/session construction.
    time1 = time.time()
    with torch.no_grad():  # inference only; no autograd graph is needed
        out1 = net_g(
            test_phone, test_phone_lengths, test_pitch, test_pitchf, test_ds, test_rnd
        )
    print(time.time() - time1)
    print(out1.shape)
    print(out1[0, 0, :5])

    providers = ["CPUExecutionProvider"]
    # Feed ONNX Runtime exactly the same inputs, converted to NumPy arrays.
    onnx_input = {
        "phone": test_phone.numpy(),
        "phone_lengths": test_phone_lengths.numpy(),
        "pitch": test_pitch.numpy(),
        "pitchf": test_pitchf.numpy(),
        "ds": test_ds.numpy(),
        "rnd": test_rnd.numpy(),
    }

    import onnxruntime
    import numpy as np

    session = onnxruntime.InferenceSession(model_path, providers=providers)
    time2 = time.time()
    out2 = torch.tensor(session.run(None, onnx_input)[0])
    # For 16-bit PCM output instead, scale by 32767 and cast to np.int16.
    print(time.time() - time2)
    print(out2.shape)
    print(out2[0, 0, :5])

    # Quantify the mismatch instead of eyeballing the first few samples.
    # NOTE(review): the discussion following this script attributes a
    # remaining difference to SineGen, so a non-zero value may still be
    # reported here even with the weights loaded.
    print("max abs diff:", float(np.max(np.abs(out1.numpy() - out2.numpy()))))
如下是两模型的输出结果:
torch.Size([1, 1, 80000])
tensor([0.0870, 0.0332, 0.1191, 0.2330, 0.2877], grad_fn=<SliceBackward0>)
torch.Size([1, 1, 80000])
tensor([0.0115, 0.0109, 0.0093, 0.0084, 0.0084])
如下是转onnx的输出日志:
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input phone
warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input pitch
warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input pitchf
warnings.warn(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:2033: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input rnd
warnings.warn(
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:249: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
t_s == t_t
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:311: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
pad_length: int = max(length - (self.window_size + 1), 0)
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:312: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
slice_start_position = max((self.window_size + 1) - length, 0)
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if pad_length > 0:
/czc/Retrieval-based-Voice-Conversion-WebUI-main/infer/lib/infer_pack/attentions.py:366: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/_internal/jit_utils.py:306: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
_C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:689: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
_C._jit_pass_onnx_graph_shape_type_inference(
/root/miniconda3/envs/valle/lib/python3.10/site-packages/torch/onnx/utils.py:1186: UserWarning: Constant folding - Only steps=1 can be constant folded for opset >= 10 onnx::Slice op. Constant folding not applied. (Triggered internally at ../torch/csrc/jit/passes/onnx/constant_fold.cpp:179.)
_C._jit_pass_onnx_graph_shape_type_inference(
============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============
verbose: False, log level: Level.ERROR
======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
****
破案了 问题出在SineGen
@Nian-Chen 请问结果差异大的问题怎么解决
破案了 问题出在SineGen
请问如何解决sinegen的问题
破案了 问题出在SineGen
请问如何解决sinegen的问题
我的方案是把这部分剥离出来,用torch计算
请问sinegen的问题具体怎么改 @Nian-Chen
请问底模 hubert_base.pt 转成 hubert_base.onnx,你们实现了吗?