Просмотр исходного кода

fix paraformer bug: when the model predicts no tokens (silence), return an empty result instead of failing

游雁 3 года назад
Родитель
Commit
51ea14f910

+ 2 - 2
funasr/bin/asr_inference.py

@@ -368,7 +368,7 @@ class Speech2Text:
 #         except TooShortUttError as e:
 #             logging.warning(f"Utterance {keys} {e}")
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp]] * nbest
+#             results = [[" ", ["sil"], [2], hyp]] * nbest
 #
 #         # Only supporting batch_size==1
 #         key = keys[0]
@@ -575,7 +575,7 @@ def inference_modelscope(
             except TooShortUttError as e:
                 logging.warning(f"Utterance {keys} {e}")
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp]] * nbest
+                results = [[" ", ["sil"], [2], hyp]] * nbest
             
             # Only supporting batch_size==1
             key = keys[0]

+ 4 - 2
funasr/bin/asr_inference_paraformer.py

@@ -227,6 +227,8 @@ class Speech2Text:
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                         predictor_outs[2], predictor_outs[3]
         pre_token_length = pre_token_length.round().long()
+        if torch.max(pre_token_length) < 1:
+            return []
         decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
 
@@ -394,7 +396,7 @@ class Speech2Text:
 #         results = speech2text(**batch)
 #         if len(results) < 1:
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+#             results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
 #         time_end = time.time()
 #         forward_time = time_end - time_beg
 #         lfr_factor = results[0][-1]
@@ -621,7 +623,7 @@ def inference_modelscope(
             results = speech2text(**batch)
             if len(results) < 1:
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
             time_end = time.time()
             forward_time = time_end - time_beg
             lfr_factor = results[0][-1]

+ 1 - 1
funasr/bin/asr_inference_paraformer_timestamp.py

@@ -410,7 +410,7 @@ def inference(
         results = speech2text(**batch)
         if len(results) < 1:
             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-            results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+            results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
         time_end = time.time()
         forward_time = time_end - time_beg
         lfr_factor = results[0][-1]

+ 3 - 1
funasr/bin/asr_inference_paraformer_vad_punc.py

@@ -235,6 +235,8 @@ class Speech2Text:
 
         predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
+        if torch.max(pre_token_length) < 1:
+            return []
         pre_token_length = pre_token_length.round().long()
         decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -602,7 +604,7 @@ def inference_modelscope(
                     results = speech2text(**batch)
                     if len(results) < 1:
                         hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
+                        results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest
                     time_end = time.time()
                     forward_time = time_end - time_beg
                     lfr_factor = results[0][-1]

+ 2 - 2
funasr/bin/asr_inference_uniasr.py

@@ -391,7 +391,7 @@ class Speech2Text:
 #         except TooShortUttError as e:
 #             logging.warning(f"Utterance {keys} {e}")
 #             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#             results = [[" ", ["<space>"], [2], hyp]] * nbest
+#             results = [[" ", ["sil"], [2], hyp]] * nbest
 #
 #         # Only supporting batch_size==1
 #         key = keys[0]
@@ -616,7 +616,7 @@ def inference_modelscope(
             except TooShortUttError as e:
                 logging.warning(f"Utterance {keys} {e}")
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["<space>"], [2], hyp]] * nbest
+                results = [[" ", ["sil"], [2], hyp]] * nbest
     
             # Only supporting batch_size==1
             key = keys[0]