Failed to generate timestamps for parakeet-tdt-1.1b
Describe the bug
When I tried to generate timestamps with the model nvidia/parakeet-tdt-1.1b, I got the following error:
ValueError: `char_offsets`: [{'char': [tensor(524)], 'start_offset': 2, 'end_offset': 3}, {'char': [tensor(106)], 'start_offset': 5, 'end_offset': 6}, {'char': [tensor(40)], 'start_offset': 6, 'end_offset': 7}, {'char': [tensor(556)], 'start_offset': 7, 'end_offset': 8}, {'char': [tensor(988)], 'start_offset': 9, 'end_offset': 10}, {'char': [tensor(8)], 'start_offset': 10, 'end_offset': 11}, {'char': [tensor(88)], 'start_offset': 11, 'end_offset': 12}, {'char': [tensor(1002)], 'start_offset': 12, 'end_offset': 13}] and `processed_tokens`: [524, 106, 40, 556, 988, 8, 88, 1002] have to be of the same length, but are: `len(offsets)`: 8 and `len(processed_tokens)`: 8
Note that the two lengths printed at the end of the error are both 8, so the values shown do not actually differ; the condition that raises (rnnt_decoding.py line 752 in the traceback below) compares `num_flattened_tokens` against `len(hypothesis.text)`.
Call stack:
----> 1 hypotheses = asr_model.transcribe(['test.wav'], return_hypotheses=True)
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, override_config)
136 )
137
--> 138 return super().transcribe(
139 audio=audio,
140 batch_size=batch_size,
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/rnnt_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, override_config)
277 * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis.
278 """
--> 279 return super().transcribe(
280 audio=audio,
281 batch_size=batch_size,
requirements/lib/python3.10/site-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
116
117 return decorate_context
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe(self, audio, batch_size, return_hypotheses, num_workers, channel_selector, augmentor, verbose, override_config, **config_kwargs)
275 generator = self.transcribe_generator(audio, override_config=transcribe_cfg)
276
--> 277 for processed_outputs in generator:
278 # Store results
279 if isinstance(processed_outputs, list):
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe_generator(self, audio, override_config)
387 # Run forward pass
388 model_outputs = self._transcribe_forward(test_batch, transcribe_cfg)
--> 389 processed_outputs = self._transcribe_output_processing(model_outputs, transcribe_cfg)
390
391 # clear up memory
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py in _transcribe_output_processing(self, outputs, trcfg)
177 ) -> Tuple[List['Hypothesis'], List['Hypothesis']]:
178 if self.cur_decoder == "rnnt":
--> 179 return super()._transcribe_output_processing(outputs, trcfg)
180
181 # CTC Path
requirements/lib/python3.10/site-packages/nemo/collections/asr/models/rnnt_models.py in _transcribe_output_processing(self, outputs, trcfg)
906 encoded_len = outputs.pop('encoded_len')
907
--> 908 best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
909 encoded,
910 encoded_len,
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/rnnt_decoding.py in rnnt_decoder_predictions_tensor(self, encoder_output, encoded_lengths, return_hypotheses, partial_hypotheses)
508 timestamp_type = self.cfg.get('rnnt_timestamp_type', 'all')
509 for hyp_idx in range(len(hypotheses)):
--> 510 hypotheses[hyp_idx] = self.compute_rnnt_timestamps(hypotheses[hyp_idx], timestamp_type)
511
512 if return_hypotheses:
requirements/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/rnnt_decoding.py in compute_rnnt_timestamps(self, hypothesis, timestamp_type)
751
752 if num_flattened_tokens != len(hypothesis.text):
--> 753 raise ValueError(
754 f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}"
755 " have to be of the same length, but are: "
Steps/Code to reproduce bug
import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-1.1b")

# Enable timestamp computation in the decoding config
decoding_cfg = asr_model.cfg.decoding
with open_dict(decoding_cfg):
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
asr_model.change_decoding_strategy(decoding_cfg)

audio = 'test.wav'  # path to the input audio file
hypothesis = asr_model.transcribe([audio], return_hypotheses=True)[0][0]
timestamp_dict = hypothesis.timestep
word_timestamps = timestamp_dict['word']
print(word_timestamps)
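For reference, here is a diagnostic variant of the repro that restricts timestamp computation to word level via the `rnnt_timestamp_type` key that the traceback reads from the decoding config (`self.cfg.get('rnnt_timestamp_type', 'all')`). This is only a sketch, not a verified workaround; whether setting the key under the decoding config this way takes effect is an assumption based on the traceback.

import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-1.1b")

decoding_cfg = asr_model.cfg.decoding
with open_dict(decoding_cfg):
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    # Hypothetical narrowing of the timestamp type; the traceback shows the default is 'all'
    decoding_cfg.rnnt_timestamp_type = 'word'
asr_model.change_decoding_strategy(decoding_cfg)

hypothesis = asr_model.transcribe(['test.wav'], return_hypotheses=True)[0][0]
print(hypothesis.timestep['word'])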
Expected behavior
It should output word timestamps instead of raising an exception.
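For clarity, this is the kind of output I expect on a successful run, following the usual NeMo word-timestamp pattern of a list of dicts with 'word', 'start_offset', and 'end_offset', and reusing `asr_model` and `hypothesis` from the repro snippet above. The 8x encoder subsampling factor used to convert offsets to seconds is my assumption for the FastConformer-based Parakeet model, not something taken from this issue.

# Reuses `asr_model` and `hypothesis` from the repro snippet above.
word_timestamps = hypothesis.timestep['word']

# Convert encoder-frame offsets to seconds. The 8x subsampling factor is an
# assumption for the FastConformer encoder; adjust if the model differs.
time_stride = 8 * asr_model.cfg.preprocessor.window_stride

for stamp in word_timestamps:
    start = stamp['start_offset'] * time_stride
    end = stamp['end_offset'] * time_stride
    print(f"{stamp['word']}: {start:.2f}s - {end:.2f}s")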
Environment overview (please complete the following information)
- Environment location: local Red Hat 7.9 machine.
- Method of NeMo install: pip install nemo_toolkit['asr']==2.0.0rc0
Environment details
If NVIDIA docker image is used you don't need to specify these. Otherwise, please provide:
- OS version: Red Hat 7.9
- PyTorch version: 2.0.1+cu117
- Python version: 3.10.13
Additional context
This is a duplicate of https://github.com/NVIDIA/NeMo/issues/8451, which was closed due to inactivity.