测试了小块实时流pcm数据，降噪的声音多了一些吱吱的声音，是写的有问题吗

Open yuexiajiayan opened this issue 1 year ago • 1 comments

import torch import soundfile as sf from librosa import istft import onnxruntime import numpy as np

data_type = np.int16 frame_size = 960 hop_length = 256 n_fft = 512 win_length = 512

#------------------init model & state session = onnxruntime.InferenceSession('gtcrn_simple.onnx', providers=['CPUExecutionProvider']) conv_cache = np.zeros([2, 1, 16, 16, 33], dtype="float32") tra_cache = np.zeros([2, 3, 1, 1, 16], dtype="float32") inter_cache = np.zeros([2, 1, 33, 16], dtype="float32")

Get the item size of the data type

item_size = data_type().itemsize print(item_size)

xx=0

fout = open('./16k_de.pcm', 'wb')

Open the PCM file for reading

with open('G:/test/16k.pcm', 'rb') as f: while True:

    raw_data = f.read(frame_size * item_size)
    if len(raw_data) == 0:
        break
    print(xx)
    xx+=1
    
    x = np.frombuffer(raw_data, dtype=data_type)
    
    
    if len(x) < frame_size:
       print(1111111)
       break
    
   
    x = x.astype(np.float32) / np.iinfo(data_type).max
    
   
    x_tensor = torch.tensor(x)
     
    # Compute STFT
    stft_matrix = torch.stft(
        x_tensor,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=torch.hann_window(win_length).pow(0.5),
        return_complex=False
    )[None]
    
    # Perform inference
    inputs = stft_matrix.numpy()
   
    block_outputs = []
    for i in range(inputs.shape[-2]):
        
        out_i, conv_cache, tra_cache, inter_cache = session.run(
            [],
            {
                'mix': inputs[..., i:i + 1, :],
                'conv_cache': conv_cache,
                'tra_cache': tra_cache,
                'inter_cache': inter_cache
            }
        )
        block_outputs.append(out_i)
    
    block_outputs = np.concatenate(block_outputs, axis=2)
    enhanced_block = istft(block_outputs[..., 0] + 1j * block_outputs[..., 1], 
                            n_fft=n_fft, hop_length=hop_length, win_length=win_length, 
                            window=np.hanning(win_length) ** 0.5) 
    # Denormalize the enhanced audio back to int16
    enhanced_int16 = (enhanced_block * np.iinfo(data_type).max).astype(data_type)
    fout.write( enhanced_int16.tobytes() )

Save the enhanced audio as PCM file

fout.close()

@Xiaobin-Rong

Mar 26 '25 05:03 yuexiajiayan

你可以测试一下和非流式读取PCM数据的推理是否有明显差别。如果有就是流式PCM读取的问题。

Mar 27 '25 03:03 Xiaobin-Rong