il y a 3 ans · 946000a29a
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -22,7 +22,7 @@ if __name__ == '__main__':
 
				     sample_offset = 0
			
 
				     
			
 
				     step = 160 * 10
			
 
				-    param_dict = {'in_cache': dict()}
			
 
				+    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
			
 
				     for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
			
 
				         if sample_offset + step >= speech_length - 1:
			
 
				             step = speech_length - sample_offset
			
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -22,7 +22,7 @@ if __name__ == '__main__':
 
				     sample_offset = 0
			
 
				     
			
 
				     step = 80 * 10
			
 
				-    param_dict = {'in_cache': dict()}
			
 
				+    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
			
 
				     for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
			
 
				         if sample_offset + step >= speech_length - 1:
			
 
				             step = speech_length - sample_offset
			
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -1,6 +1,5 @@
 
				 import argparse
			
 
				 import logging
			
 
				-import os
			
 
				 import sys
			
 
				 import json
			
 
				 from pathlib import Path
			
@@ -30,7 +29,8 @@ from funasr.models.frontend.wav_frontend import WavFrontendOnline
 
				 from funasr.models.frontend.wav_frontend import WavFrontend
			
 
				 from funasr.bin.vad_inference import Speech2VadSegment
			
 
				 
			
 
				-
			
 
				+header_colors = '\033[95m'
			
 
				+end_colors = '\033[0m'
			
 
				 
			
 
				 
			
 
				 class Speech2VadSegmentOnline(Speech2VadSegment):
			
@@ -55,7 +55,7 @@ class Speech2VadSegmentOnline(Speech2VadSegment):
 
				     @torch.no_grad()
			
 
				     def __call__(
			
 
				             self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
			
 
				-            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False
			
 
				+            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
			
 
				     ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
			
 
				         """Inference
			
 
				 
			
@@ -86,7 +86,8 @@ class Speech2VadSegmentOnline(Speech2VadSegment):
 
				                 "feats": feats,
			
 
				                 "waveform": waveforms,
			
 
				                 "in_cache": in_cache,
			
 
				-                "is_final": is_final
			
 
				+                "is_final": is_final,
			
 
				+                "max_end_sil": max_end_sil
			
 
				             }
			
 
				             # a. To device
			
 
				             batch = to_device(batch, device=self.device)
			
@@ -217,6 +218,7 @@ def inference_modelscope(
 
				         vad_results = []
			
 
				         batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
			
 
				         is_final = param_dict['is_final'] if param_dict is not None else False
			
 
				+        max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800
			
 
				         for keys, batch in loader:
			
 
				             assert isinstance(batch, dict), type(batch)
			
 
				             assert all(isinstance(s, str) for s in keys), keys
			
@@ -224,6 +226,7 @@ def inference_modelscope(
 
				             assert len(keys) == _bs, f"{len(keys)} != {_bs}"
			
 
				             batch['in_cache'] = batch_in_cache
			
 
				             batch['is_final'] = is_final
			
 
				+            batch['max_end_sil'] = max_end_sil
			
 
				 
			
 
				             # do vad segment
			
 
				             _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
			
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -473,8 +473,9 @@ class E2EVadModel(nn.Module):
 
				         return segments, in_cache
			
 
				 
			
 
				     def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
			
 
				-                is_final: bool = False
			
 
				+                is_final: bool = False, max_end_sil: int = 800
			
 
				                 ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
			
 
				+        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
			
 
				         self.waveform = waveform  # compute decibel for each frame
			
 
				         self.ComputeDecibel()
			
 
				         self.ComputeScores(feats, in_cache)