Ability to Be Run From a Background Thread Broken With 1.5
With VoxCPM 1.5, our TTS is returning errors related to torch.compile that prevent the app from running inference on a background thread. We tested both pretrained models, openbmb/VoxCPM-0.5B and openbmb/VoxCPM1.5, and both fail with the error below; everything worked fine with the previous version.
Exception in thread Thread-4 (_processing_worker):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/data/models/tts/sentenceLevelTTS.py", line 100, in _processing_worker
    audio_path = self.tts_engine.synthesize(sentence, tmp_file.name, 8)
  File "/data/models/tts/voxcpm.py", line 103, in synthesize
    wav = self.model.generate(
  File "/opt/venv/lib/python3.10/site-packages/voxcpm/core.py", line 130, in generate
    return next(self._generate(*args, streaming=False, **kwargs))
  File "/opt/venv/lib/python3.10/site-packages/voxcpm/core.py", line 224, in _generate
    for wav, _, _ in generate_result:
  File "/opt/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 38, in generator_context
    response = gen.send(None)
  File "/opt/venv/lib/python3.10/site-packages/voxcpm/model/voxcpm.py", line 680, in _generate_with_prompt_cache
    latent_pred, pred_audio_feat = next(inference_result)
  File "/opt/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 38, in generator_context
    response = gen.send(None)
  File "/opt/venv/lib/python3.10/site-packages/voxcpm/model/voxcpm.py", line 742, in _inference
    feat_embed = self.feat_encoder(feat)  # [b, t, h_feat]
  File "/opt/venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 375, in __call__
    return super().__call__(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 736, in compile_wrapper
    return fn(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/voxcpm/modules/locenc/local_encoder.py", line 17, in forward
    def forward(self, x):
  File "/opt/venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 929, in _fn
    return fn(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1241, in forward
    return compiled_fn(full_args)
  File "/opt/venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 384, in runtime_wrapper
    all_outs = call_func_at_runtime_with_args(
  File "/opt/venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 126, in call_func_at_runtime_with_args
    out = normalize_as_list(f(args))
  File "/opt/venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 750, in inner_fn
    outs = compiled_fn(args)
  File "/opt/venv/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 556, in wrapper
    return compiled_fn(runtime_args)
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 584, in __call__
    return self.current_callable(inputs)
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1655, in run
    return compiled_fn(new_inputs)  # type: ignore[arg-type]
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/cudagraph_trees.py", line 403, in deferred_cudagraphify
    fn, out = cudagraphify(model, inputs, new_static_input_idxs, *args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/cudagraph_trees.py", line 460, in cudagraphify
    manager = get_container(device_index).get_tree_manager()
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/cudagraph_trees.py", line 331, in get_container
    container_dict = get_obj(local, "tree_manager_containers")
  File "/opt/venv/lib/python3.10/site-packages/torch/_inductor/cudagraph_trees.py", line 326, in get_obj
    assert torch._C._is_key_in_tls(attr_name)
AssertionError
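The AssertionError at the bottom comes from Inductor's CUDA-graph machinery (cudagraph_trees.py), which keeps its tree-manager container in thread-local storage; that state appears to exist only for threads torch itself set up, so the first compiled call made from a plain worker thread fails the torch._C._is_key_in_tls check. As a workaround we are experimenting with disabling torch.compile (or just the CUDA-graph part) before the model is loaded. A sketch of what we're trying; these are generic torch knobs, not an official VoxCPM switch, and we don't know whether 1.5 exposes one:

import os

# Option 1: disable torch.compile entirely. TORCH_COMPILE_DISABLE is read when
# torch._dynamo is imported, so set it before importing torch/voxcpm.
os.environ["TORCH_COMPILE_DISABLE"] = "1"

# Option 2 (alternative): keep compilation but disable CUDA graphs, the part
# whose thread-local bookkeeping the assertion comes from.
# import torch._inductor.config as inductor_config
# inductor_config.triton.cudagraphs = False

from voxcpm import VoxCPM
model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")  # load only after the flags are set

Either knob should keep the worker-thread call from ever reaching cudagraphify, presumably at the cost of whatever speedup 1.5's compilation was adding.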
Here is a snippet of our inference code, which is basically just the example code:
def synthesize(self, text: str, out_path: str, steps: int) -> str:
    # Clean up the text so that it is only valid UTF-8
    text = remove_emoji(text).strip()
    print(f"Text: {text}")
    if text == "":
        return ""
    wav = self.model.generate(
        text=text,
        prompt_wav_path=self.voice_wav,        # optional: path to a prompt speech for voice cloning
        prompt_text=self.voice_transcription,  # optional: reference text
        cfg_value=2.0,                # LM guidance on LocDiT; higher for better adherence to the prompt, but maybe worse quality
        inference_timesteps=steps,    # LocDiT inference timesteps; higher for better results, lower for faster speed
        normalize=True,               # enable external text-normalization tool
        denoise=True,                 # enable external denoise tool
        retry_badcase=True,           # enable retrying for some bad cases (unstoppable generation)
        retry_badcase_max_times=3,    # maximum number of retries
        retry_badcase_ratio_threshold=6.0,  # maximum length ratio for bad-case detection (simple but effective); can be raised for slow-paced speech
    )
    self.ta.write(out_path, wav, 16000)
    return out_path
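For reference, the worker that drives synthesize() is just a plain threading.Thread. A simplified sketch of it follows; the queue names are placeholders, and only _processing_worker and the synthesize call match our real code:

import tempfile
import threading

def _processing_worker(self):
    # Runs as a plain background thread (Thread-4 in the traceback above);
    # the first model.generate() reached from here trips the assertion.
    while True:
        sentence = self.sentence_queue.get()  # placeholder name
        if sentence is None:
            break
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            audio_path = self.tts_engine.synthesize(sentence, tmp_file.name, 8)
        self.audio_queue.put(audio_path)  # placeholder name

# started with:
# threading.Thread(target=self._processing_worker, daemon=True).start()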