NeMo issue: very slow-motion speech output for some TTS/vocoder model combinations
def text_to_speech(text, tts_model_name, vocoder_model_name, max_length=100, file_name_prefix=""):
    """Synthesize `text` with a NeMo TTS model + vocoder pair and write a WAV file.

    Args:
        text: Input text; split into chunks of at most `max_length` characters.
        tts_model_name: Pretrained NeMo spectrogram-generator name (Tacotron2/FastPitch).
        vocoder_model_name: Pretrained NeMo vocoder name (WaveGlow/HiFi-GAN).
        max_length: Maximum characters per synthesis chunk.
        file_name_prefix: Prefix for the output file name.

    Returns:
        Path of the written WAV file.

    Raises:
        ValueError: If either model name is not a supported family.
    """
    torch.set_grad_enabled(False)
    # Native sampling rate each pretrained checkpoint was trained at,
    # keyed by the FULL pretrained model name.
    sampling_rates = {
        "tts_en_tacotron2": 22050,
        "tts_en_fastpitch": 22050,
        "tts_en_fastpitch_ipa": 22050,
        "tts_en_fastpitch_multispeaker": 44100,
        "tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102": 22050,
        "tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210": 22050,
        "tts_de_fastpitch_multispeaker_5": 44100,
        "tts_es_fastpitch_multispeaker": 44100,
        "tts_zh_fastpitch_sfspeech": 22050,
        "tts_en_waveglow_88m": 22050,
        "tts_en_hifigan": 22050,
        "tts_en_lj_hifigan_ft_mixertts": 22050,
        "tts_en_lj_hifigan_ft_mixerttsx": 22050,
        "tts_en_hifitts_hifigan_ft_fastpitch": 44100,
        "tts_de_hifigan_singleSpeaker_thorstenNeutral_2102": 22050,
        "tts_de_hifigan_singleSpeaker_thorstenNeutral_2210": 22050,
        "tts_de_hui_hifigan_ft_fastpitch_multispeaker_5": 44100,
        "tts_es_hifigan_ft_fastpitch_multispeaker": 44100,
        "tts_zh_hifigan_sfspeech": 22050
    }
    # Load TTS (spectrogram generator) model
    if "tacotron" in tts_model_name.lower():
        tts_model = nemo_tts.models.Tacotron2Model.from_pretrained(model_name=tts_model_name)
    elif "fastpitch" in tts_model_name.lower():
        tts_model = nemo_tts.models.FastPitchModel.from_pretrained(model_name=tts_model_name)
    else:
        raise ValueError("Unsupported TTS model name")
    # Load vocoder model
    if "waveglow" in vocoder_model_name.lower():
        vocoder = nemo_tts.models.WaveGlowModel.from_pretrained(model_name=vocoder_model_name)
    elif "hifigan" in vocoder_model_name.lower():
        vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name=vocoder_model_name)
    else:
        raise ValueError("Unsupported vocoder model name")
    tts_model.eval()
    vocoder.eval()
    # Split text into chunks (long inputs fail / degrade in one pass)
    chunks = split_text(text, max_length)
    full_audio = []
    for chunk in chunks:
        parsed = tts_model.parse(chunk)
        spectrogram = tts_model.generate_spectrogram(tokens=parsed)
        audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        # BUG FIX: `audio.to()` with no arguments is a no-op and .numpy() fails on
        # CUDA tensors — move to CPU explicitly before converting.
        audio_numpy = audio.detach().cpu().numpy().T
        full_audio.extend(audio_numpy)
    full_audio_np = np.hstack(full_audio)
    file_name = f'samples1/{file_name_prefix}_{tts_model_name}_{vocoder_model_name}.wav'
    # BUG FIX: the original looked up `model_name.split('_')[1]` (e.g. "en"),
    # which never matches the full-name keys above, so .get() ALWAYS returned the
    # 22050 default. Writing 44.1 kHz audio with a 22.05 kHz header plays it at
    # half speed — the "slow motion" effect. Look up the full model name instead.
    tts_samplerate = sampling_rates.get(tts_model_name, 22050)
    vocoder_samplerate = sampling_rates.get(vocoder_model_name, 22050)
    if tts_samplerate != vocoder_samplerate:
        # Root cause of the remaining bad combos: a vocoder trained at 44.1 kHz
        # interprets a 22.05 kHz model's mel frames at its own rate, so the audio
        # itself is slowed/pitch-shifted no matter what rate the WAV header says.
        # Only same-rate TTS/vocoder pairs produce correct speech.
        print(f"Warning: sample-rate mismatch between {tts_model_name} "
              f"({tts_samplerate} Hz) and {vocoder_model_name} ({vocoder_samplerate} Hz); "
              f"output will sound slowed or sped up.")
    # The vocoder generates the waveform, so its training rate is the output rate.
    samplerate = vocoder_samplerate
    sf.write(file_name, full_audio_np, samplerate)
    return file_name
def _pretrained_names(model_class):
    # Collect the registered pretrained checkpoint names for a NeMo model class.
    return [info.pretrained_model_name for info in model_class.list_available_models()]

# Enumerate every available TTS (spectrogram) and vocoder checkpoint.
tts_models = (_pretrained_names(nemo_tts.models.Tacotron2Model)
              + _pretrained_names(nemo_tts.models.FastPitchModel))
vocoder_models = (_pretrained_names(nemo_tts.models.WaveGlowModel)
                  + _pretrained_names(nemo_tts.models.HifiGanModel))
Here's what I do for splitting, since long texts don't seem to work, as far as I know:
def split_text(text, max_length):
    """Split `text` into whitespace-normalized chunks of at most `max_length` chars.

    A chunk is flushed when appending the next word (plus a separating space)
    would exceed `max_length`. A single word longer than `max_length` becomes a
    chunk of its own rather than raising.

    Args:
        text: Input text; split on any whitespace.
        max_length: Soft maximum chunk length in characters.

    Returns:
        List of non-empty chunk strings (empty list for empty/whitespace input).
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        # BUG FIX: guard on `current_chunk` — the original indexed
        # current_chunk[-1] and raised IndexError when the very first word
        # already exceeded max_length.
        if current_chunk and len(current_chunk) + len(word) + 1 > max_length:
            # NOTE: the original tried to re-split at the last ".!?" when the
            # chunk ended with punctuation, but in that branch the punctuation
            # IS the last character, so the split always flushed the whole
            # chunk — with a stray leading space on the next one. Flushing the
            # chunk unconditionally is equivalent and drops the leading space.
            chunks.append(current_chunk)
            current_chunk = word
        else:
            current_chunk = f"{current_chunk} {word}" if current_chunk else word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
Then, when I use this code to test the different model combinations like this:
# Generate speech for each combination of TTS and vocoder models.
# NOTE(review): `longtext` must be defined earlier in the script — not visible here.
for tts_model_name in tts_models:
    for vocoder_model_name in vocoder_models:
        print(f"Generating speech with TTS model: {tts_model_name} and vocoder model: {vocoder_model_name}")
        # Cross-pairing every TTS model with every vocoder necessarily includes
        # mismatched-sample-rate pairs (22.05 kHz model + 44.1 kHz vocoder or
        # vice versa) — presumably the source of the slow-motion outputs; pairs
        # should share a training sample rate. TODO confirm against model cards.
        text_to_speech(longtext, tts_model_name, vocoder_model_name, max_length=100, file_name_prefix="speech_output")
Among the resulting files, many are in slow motion — for example, these combinations:
speech_output__tts_en_tacotron2_tts_de_hui_hifigan_ft_fastpitch_multispeaker.wav, speech_output__tts_en_tacotron2_tts_es_hifigan_ft_fastpitch_multispeaker.wav, speech_output__tts_en_fastpitch_ipa_tts_es_hifigan_ft_fastpitch_multispeaker.wav, speech_output__tts_en_fastpitch_ipa_tts_en_hifitts_hifigan_ft_fastpitch.wav
The multispeaker models always produce extremely slow-motion, low-pitched voices. As you can see, I have already tried setting the sampling rates explicitly, based on my limited knowledge.
Any thoughts?