Просмотр исходного кода

fix paraformer bug: when the model predicts no tokens (silence), return an empty result instead of failing

游雁 3 года назад
Родитель
Commit
51ea14f910

+ 2 - 2
funasr/bin/asr_inference.py

@@ -368,7 +368,7 @@ class Speech2Text:
 #         except TooShortUttError as e:
 #             logging.warning(f"Utterance {keys} {e}")
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp]] * nbest
+#             results = [[" ", ["sil"], [2], hyp]] * nbest
 #
 #         # Only supporting batch_size==1
 #         key = keys[0]
@@ -575,7 +575,7 @@ def inference_modelscope(
             except TooShortUttError as e:
                 logging.warning(f"Utterance {keys} {e}")
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp]] * nbest
+                results = [[" ", ["sil"], [2], hyp]] * nbest
             
             # Only supporting batch_size==1
             key = keys[0]

+ 4 - 2
funasr/bin/asr_inference_paraformer.py

@@ -227,6 +227,8 @@ class Speech2Text:
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                         predictor_outs[2], predictor_outs[3]
         pre_token_length = pre_token_length.round().long()
+        if torch.max(pre_token_length) < 1:
+            return []
         decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
 
@@ -394,7 +396,7 @@ class Speech2Text:
 #         results = speech2text(**batch)
 #         if len(results) < 1:
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+#             results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
 #         time_end = time.time()
 #         forward_time = time_end - time_beg
 #         lfr_factor = results[0][-1]
@@ -621,7 +623,7 @@ def inference_modelscope(
             results = speech2text(**batch)
             if len(results) < 1:
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
             time_end = time.time()
             forward_time = time_end - time_beg
             lfr_factor = results[0][-1]

+ 1 - 1
funasr/bin/asr_inference_paraformer_timestamp.py

@@ -410,7 +410,7 @@ def inference(
         results = speech2text(**batch)
         if len(results) < 1:
             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-            results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+            results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
         time_end = time.time()
         forward_time = time_end - time_beg
         lfr_factor = results[0][-1]

+ 3 - 1
funasr/bin/asr_inference_paraformer_vad_punc.py

@@ -235,6 +235,8 @@ class Speech2Text:
 
         predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
+        if torch.max(pre_token_length) < 1:
+            return []
         pre_token_length = pre_token_length.round().long()
         decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -602,7 +604,7 @@ def inference_modelscope(
                     results = speech2text(**batch)
                     if len(results) < 1:
                         hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
+                        results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest
                     time_end = time.time()
                     forward_time = time_end - time_beg
                     lfr_factor = results[0][-1]

+ 2 - 2
funasr/bin/asr_inference_uniasr.py

@@ -391,7 +391,7 @@ class Speech2Text:
 #         except TooShortUttError as e:
 #             logging.warning(f"Utterance {keys} {e}")
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp]] * nbest
+#             results = [[" ", ["sil"], [2], hyp]] * nbest
 #
 #         # Only supporting batch_size==1
 #         key = keys[0]
@@ -616,7 +616,7 @@ def inference_modelscope(
             except TooShortUttError as e:
                 logging.warning(f"Utterance {keys} {e}")
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp]] * nbest
+                results = [[" ", ["sil"], [2], hyp]] * nbest
     
             # Only supporting batch_size==1
             key = keys[0]