NeMo
NeMo copied to clipboard
Very slow motion speech for some combinations
def text_to_speech(text, tts_model_name, vocoder_model_name, max_length=100, file_name_prefix=""):
torch.set_grad_enabled(False)
sampling_rates = {
"tts_en_tacotron2": 22050,
"tts_en_fastpitch": 22050,
"tts_en_fastpitch_ipa": 22050,
"tts_en_fastpitch_multispeaker": 44100,
"tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102": 22050,
"tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210": 22050,
"tts_de_fastpitch_multispeaker_5": 44100,
"tts_es_fastpitch_multispeaker": 44100,
"tts_zh_fastpitch_sfspeech": 22050,
"tts_en_waveglow_88m": 22050,
"tts_en_hifigan": 22050,
"tts_en_lj_hifigan_ft_mixertts": 22050,
"tts_en_lj_hifigan_ft_mixerttsx": 22050,
"tts_en_hifitts_hifigan_ft_fastpitch": 44100,
"tts_de_hifigan_singleSpeaker_thorstenNeutral_2102": 22050,
"tts_de_hifigan_singleSpeaker_thorstenNeutral_2210": 22050,
"tts_de_hui_hifigan_ft_fastpitch_multispeaker_5": 44100,
"tts_es_hifigan_ft_fastpitch_multispeaker": 44100,
"tts_zh_hifigan_sfspeech": 22050
}
# Load TTS model
if "tacotron" in tts_model_name.lower():
tts_model = nemo_tts.models.Tacotron2Model.from_pretrained(model_name=tts_model_name)
elif "fastpitch" in tts_model_name.lower():
tts_model = nemo_tts.models.FastPitchModel.from_pretrained(model_name=tts_model_name)
else:
raise ValueError("Unsupported TTS model name")
# Load vocoder model
if "waveglow" in vocoder_model_name.lower():
vocoder = nemo_tts.models.WaveGlowModel.from_pretrained(model_name=vocoder_model_name)
elif "hifigan" in vocoder_model_name.lower():
vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name=vocoder_model_name)
else:
raise ValueError("Unsupported vocoder model name")
tts_model.eval()
vocoder.eval()
# Split text into chunks
chunks = split_text(text, max_length)
full_audio = []
for chunk in chunks:
parsed = tts_model.parse(chunk)
spectrogram = tts_model.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
audio_numpy = audio.to().numpy().T
full_audio.extend(audio_numpy)
full_audio_np = np.hstack(full_audio)
file_name = f'samples1/{file_name_prefix}_{tts_model_name}_{vocoder_model_name}.wav'
# Determine the correct sampling rate
tts_samplerate = sampling_rates.get(tts_model_name.split('_')[1], 22050) # Get default based on TTS model type
vocoder_samplerate = sampling_rates.get(vocoder_model_name.split('_')[1], 22050) # Get default based on vocoder model type
samplerate = max(tts_samplerate, vocoder_samplerate) # Choose the higher rate if they differ
sf.write(file_name, full_audio_np, samplerate)
return file_name
# List all TTS and vocoder models
tts_models = [model.pretrained_model_name for model in nemo_tts.models.Tacotron2Model.list_available_models()] + \
[model.pretrained_model_name for model in nemo_tts.models.FastPitchModel.list_available_models()]
vocoder_models = [model.pretrained_model_name for model in nemo_tts.models.WaveGlowModel.list_available_models()] + \
[model.pretrained_model_name for model in nemo_tts.models.HifiGanModel.list_available_models()]
Here's what I do for splitting since long texts dont work out afaik,
def split_text(text, max_length):
words = text.split()
chunks = []
current_chunk = ""
for word in words:
# Check if adding the word would exceed the max length
if len(current_chunk) + len(word) + 1 > max_length:
# If the chunk is too long and doesn't end with punctuation, add it as is
if not current_chunk[-1] in ".!?":
chunks.append(current_chunk)
current_chunk = word
else:
# If it ends with punctuation, find the nearest punctuation to split
nearest_punct = max(current_chunk.rfind("."), current_chunk.rfind("!"), current_chunk.rfind("?"))
split_at = nearest_punct + 1
chunks.append(current_chunk[:split_at])
current_chunk = current_chunk[split_at:].strip() + " " + word
else:
current_chunk = current_chunk + " " + word if current_chunk else word
if current_chunk:
chunks.append(current_chunk)
return chunks
And then, When I use this code to test different model combinations like this,
# Generate speech for each combination of TTS and vocoder models
for tts_model_name in tts_models:
for vocoder_model_name in vocoder_models:
print(f"Generating speech with TTS model: {tts_model_name} and vocoder model: {vocoder_model_name}")
text_to_speech(longtext, tts_model_name, vocoder_model_name, max_length=100, file_name_prefix="speech_output")
Among the resultant files, many are in slow motion. like these combinations:
_tts_en_tacotron2_tts_de_hui_hifigan_ft_fastpitch_multispeaker.wav speech_output__tts_en_tacotron2_tts_es_hifigan_ft_fastpitch_multispeaker.wav speech_output__tts_en_fastpitch_ipa_tts_es_hifigan_ft_fastpitch_multispeaker.wav speech_output__tts_en_fastpitch_ipa_tts_en_hifitts_hifigan_ft_fastpitch.wav
Multispeaker always results in extreme slow motion low pitch voices. As you can see, I have tried setting the frequencies too based on my limited knowledge.
Any thoughts?