测试了小块实时流pcm数据,降噪的声音多了一些吱吱的声音,是写的有问题吗
import torch import soundfile as sf from librosa import istft import onnxruntime import numpy as np
data_type = np.int16 frame_size = 960 hop_length = 256 n_fft = 512 win_length = 512
#------------------init model & state session = onnxruntime.InferenceSession('gtcrn_simple.onnx', providers=['CPUExecutionProvider']) conv_cache = np.zeros([2, 1, 16, 16, 33], dtype="float32") tra_cache = np.zeros([2, 3, 1, 1, 16], dtype="float32") inter_cache = np.zeros([2, 1, 33, 16], dtype="float32")
Get the item size of the data type
item_size = data_type().itemsize print(item_size)
xx=0
fout = open('./16k_de.pcm', 'wb')
Open the PCM file for reading
with open('G:/test/16k.pcm', 'rb') as f: while True:
raw_data = f.read(frame_size * item_size)
if len(raw_data) == 0:
break
print(xx)
xx+=1
x = np.frombuffer(raw_data, dtype=data_type)
if len(x) < frame_size:
print(1111111)
break
x = x.astype(np.float32) / np.iinfo(data_type).max
x_tensor = torch.tensor(x)
# Compute STFT
stft_matrix = torch.stft(
x_tensor,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=torch.hann_window(win_length).pow(0.5),
return_complex=False
)[None]
# Perform inference
inputs = stft_matrix.numpy()
block_outputs = []
for i in range(inputs.shape[-2]):
out_i, conv_cache, tra_cache, inter_cache = session.run(
[],
{
'mix': inputs[..., i:i + 1, :],
'conv_cache': conv_cache,
'tra_cache': tra_cache,
'inter_cache': inter_cache
}
)
block_outputs.append(out_i)
block_outputs = np.concatenate(block_outputs, axis=2)
enhanced_block = istft(block_outputs[..., 0] + 1j * block_outputs[..., 1],
n_fft=n_fft, hop_length=hop_length, win_length=win_length,
window=np.hanning(win_length) ** 0.5)
# Denormalize the enhanced audio back to int16
enhanced_int16 = (enhanced_block * np.iinfo(data_type).max).astype(data_type)
fout.write( enhanced_int16.tobytes() )
Save the enhanced audio as PCM file
fout.close()
@Xiaobin-Rong
你可以测试一下和非流式读取PCM数据的推理是否有明显差别。如果有就是流式PCM读取的问题。