Failed to generate timestamps for parakeet-tdt-1.1b
Describe the bug
When I tried to generate timestamps with the model nvidia/parakeet-tdt-1.1b, I got the following error:
ValueError: `char_offsets`: [{'char': [tensor(524)], 'start_offset': 2, 'end_offset': 3}, {'char': [tensor(106)], 'start_offset': 5, 'end_offset': 6}, {'char': [tensor(40)], 'start_offset': 6, 'end_offset': 7}, {'char': [tensor(556)], 'start_offset': 7, 'end_offset': 8}, {'char': [tensor(988)], 'start_offset': 9, 'end_offset': 10}, {'char': [tensor(8)], 'start_offset': 10, 'end_offset': 11}, {'char': [tensor(88)], 'start_offset': 11, 'end_offset': 12}, {'char': [tensor(1002)], 'start_offset': 12, 'end_offset': 13}] and `processed_tokens`: [524, 106, 40, 556, 988, 8, 88, 1002] have to be of the same length, but are: `len(offsets)`: 8 and `len(processed_tokens)`: 8
Note that the two lengths printed at the end of the error are both 8, so the values shown do not actually differ; the condition that raises (rnnt_decoding.py line 752 in the traceback below) compares `num_flattened_tokens` against `len(hypothesis.text)`.
Call stack:
----> 1 hypotheses = asr_model.transcribe(['test.wav'], return_hypotheses=True)
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, override_config)
136 )
137
--> 138 return super().transcribe(
139 audio=audio,
140 batch_size=batch_size,
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/rnnt_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, override_config)
277 * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis.
278 """
--> 279 return super().transcribe(
280 audio=audio,
281 batch_size=batch_size,
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe(self, audio, batch_size, return_hypotheses, num_workers, channel_selector, augmentor, verbose, override_config, **config_kwargs)
275 generator = self.transcribe_generator(audio, override_config=transcribe_cfg)
276
--> 277 for processed_outputs in generator:
278 # Store results
279 if isinstance(processed_outputs, list):
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe_generator(self, audio, override_config)
387 # Run forward pass
388 model_outputs = self._transcribe_forward(test_batch, transcribe_cfg)
--> 389 processed_outputs = self._transcribe_output_processing(model_outputs, transcribe_cfg)
390
391 # clear up memory
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py in _transcribe_output_processing(self, outputs, trcfg)
177 ) -> Tuple[List['Hypothesis'], List['Hypothesis']]:
178 if self.cur_decoder == "rnnt":
--> 179 return super()._transcribe_output_processing(outputs, trcfg)
180
181 # CTC Path
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/rnnt_models.py in _transcribe_output_processing(self, outputs, trcfg)
906 encoded_len = outputs.pop('encoded_len')
907
--> 908 best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
909 encoded,
910 encoded_len,
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/rnnt_decoding.py in rnnt_decoder_predictions_tensor(self, encoder_output, encoded_lengths, return_hypotheses, partial_hypotheses)
508 timestamp_type = self.cfg.get('rnnt_timestamp_type', 'all')
509 for hyp_idx in range(len(hypotheses)):
--> 510 hypotheses[hyp_idx] = self.compute_rnnt_timestamps(hypotheses[hyp_idx], timestamp_type)
511
512 if return_hypotheses:
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/rnnt_decoding.py in compute_rnnt_timestamps(self, hypothesis, timestamp_type)
751
752 if num_flattened_tokens != len(hypothesis.text):
--> 753 raise ValueError(
754 f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}"
755 " have to be of the same length, but are: "
Steps/Code to reproduce bug
import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-1.1b")

# Enable timestamp computation in the decoding config
decoding_cfg = asr_model.cfg.decoding
with open_dict(decoding_cfg):
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
asr_model.change_decoding_strategy(decoding_cfg)

audio = 'test.wav'  # path to the input audio file
hypothesis = asr_model.transcribe([audio], return_hypotheses=True)[0][0]
timestamp_dict = hypothesis.timestep
word_timestamps = timestamp_dict['word']
print(word_timestamps)
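For reference, here is a diagnostic variant of the repro that restricts timestamp computation to word level via the `rnnt_timestamp_type` key that the traceback reads from the decoding config (`self.cfg.get('rnnt_timestamp_type', 'all')`). This is only a sketch, not a verified workaround; whether setting the key under the decoding config this way takes effect is an assumption based on the traceback.

import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-1.1b")

decoding_cfg = asr_model.cfg.decoding
with open_dict(decoding_cfg):
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    # Hypothetical narrowing of the timestamp type; the traceback shows the default is 'all'
    decoding_cfg.rnnt_timestamp_type = 'word'
asr_model.change_decoding_strategy(decoding_cfg)

hypothesis = asr_model.transcribe(['test.wav'], return_hypotheses=True)[0][0]
print(hypothesis.timestep['word'])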
Expected behavior
It should output word timestamps instead of raising an exception.
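For clarity, this is the kind of output I expect on a successful run, following the usual NeMo word-timestamp pattern of a list of dicts with 'word', 'start_offset', and 'end_offset', and reusing `asr_model` and `hypothesis` from the repro snippet above. The 8x encoder subsampling factor used to convert offsets to seconds is my assumption for the FastConformer-based Parakeet model, not something taken from this issue.

# Reuses `asr_model` and `hypothesis` from the repro snippet above.
word_timestamps = hypothesis.timestep['word']

# Convert encoder-frame offsets to seconds. The 8x subsampling factor is an
# assumption for the FastConformer encoder; adjust if the model differs.
time_stride = 8 * asr_model.cfg.preprocessor.window_stride

for stamp in word_timestamps:
    start = stamp['start_offset'] * time_stride
    end = stamp['end_offset'] * time_stride
    print(f"{stamp['word']}: {start:.2f}s - {end:.2f}s")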
Environment overview (please complete the following information)
- Environment location: local Red Hat 7.9 machine.
- Method of NeMo install: pip install nemo_toolkit['asr']==2.0.0rc0
Environment details
If NVIDIA docker image is used you don't need to specify these. Otherwise, please provide:
- OS version: Red Hat 7.9
- PyTorch version: 2.0.1+cu117
- Python version: 3.10.13
Additional context
This is a duplicate of https://github.com/NVIDIA/NeMo/issues/8451, which was closed due to inactivity.