|
|
@@ -235,23 +235,23 @@ class BiCifParaformer(Paraformer):
|
|
|
self.nbest = kwargs.get("nbest", 1)
|
|
|
|
|
|
meta_data = {}
|
|
|
- if isinstance(data_in, torch.Tensor): # fbank
|
|
|
- speech, speech_lengths = data_in, data_lengths
|
|
|
- if len(speech.shape) < 3:
|
|
|
- speech = speech[None, :, :]
|
|
|
- if speech_lengths is None:
|
|
|
- speech_lengths = speech.shape[1]
|
|
|
- else:
|
|
|
- # extract fbank feats
|
|
|
- time1 = time.perf_counter()
|
|
|
- audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
|
|
|
- time2 = time.perf_counter()
|
|
|
- meta_data["load_data"] = f"{time2 - time1:0.3f}"
|
|
|
- speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
|
|
|
- frontend=frontend)
|
|
|
- time3 = time.perf_counter()
|
|
|
- meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
|
|
|
- meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
|
|
|
+ # if isinstance(data_in, torch.Tensor): # fbank
|
|
|
+ # speech, speech_lengths = data_in, data_lengths
|
|
|
+ # if len(speech.shape) < 3:
|
|
|
+ # speech = speech[None, :, :]
|
|
|
+ # if speech_lengths is None:
|
|
|
+ # speech_lengths = speech.shape[1]
|
|
|
+ # else:
|
|
|
+ # extract fbank feats
|
|
|
+ time1 = time.perf_counter()
|
|
|
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
|
|
|
+ time2 = time.perf_counter()
|
|
|
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
|
|
|
+ speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
|
|
|
+ frontend=frontend)
|
|
|
+ time3 = time.perf_counter()
|
|
|
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
|
|
|
+ meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
|
|
|
|
|
|
speech = speech.to(device=kwargs["device"])
|
|
|
speech_lengths = speech_lengths.to(device=kwargs["device"])
|