// E2EVadModel.cs (27 KB)
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Diagnostics;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Threading.Tasks;
  7. using AliFsmnVadSharp.Model;
  8. namespace AliFsmnVadSharp
  9. {
  /// <summary>
  /// States of the endpoint-detection state machine. The model stores the
  /// current state as an <c>int</c>, so the numeric values are significant.
  /// </summary>
  internal enum VadStateMachine
  {
      /// <summary>No speech start point has been confirmed yet.</summary>
      kVadInStateStartPointNotDetected = 1,

      /// <summary>A start point was confirmed and no end point has followed it yet.</summary>
      kVadInStateInSpeechSegment = 2,

      /// <summary>An end point has been confirmed for the current segment.</summary>
      kVadInStateEndPointDetected = 3,
  }
  /// <summary>
  /// Detection modes compared against the <c>detect_mode</c> option, which is
  /// an <c>int</c>; the numeric values are therefore significant.
  /// </summary>
  internal enum VadDetectMode
  {
      /// <summary>Single-utterance mode: a start-silence timeout is applied.</summary>
      kVadSingleUtteranceDetectMode = 0,

      /// <summary>
      /// Multiple-utterance mode: per-segment state is reset after each end
      /// point. (The spelling "Mutiple" is part of the existing identifier.)
      /// </summary>
      kVadMutipleUtteranceDetectMode = 1,
  }
  21. internal class E2EVadModel
  22. {
  // Options and the sliding-window detector. Both are replaced by the constructor.
  private VadPostConfEntity _vad_opts = new VadPostConfEntity();
  private WindowDetector _windows_detector = new WindowDetector();

  // Stream bookkeeping. Fields without an initializer start at the CLR default
  // (0 / false / null); AllResetDetection() assigns all of them explicitly.
  private bool _is_final;
  private int _data_buf_start_frame;
  private int _frm_cnt;

  // Per-segment state; -1 means "not confirmed yet".
  private int _latest_confirmed_speech_frame;
  private int _lastest_confirmed_silence_frame = -1;
  private int _continous_silence_frame_count;
  private int _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
  private int _confirmed_start_frame = -1;
  private int _confirmed_end_frame = -1;
  private int _number_end_time_detected;
  private int _sil_frame;
  private int[] _sil_pdf_ids = new int[0];

  // Running noise level in dB; values below -99.9 are treated by GetFrameState
  // as "no estimate yet".
  private double _noise_average_decibel = -100.0D;
  private bool _pre_end_silence_detected;

  // Output segments and the cursor used by DefaultCall to report each one.
  private bool _next_seg = true;
  private List<E2EVadSpeechBufWithDoaEntity> _output_data_buf;
  private int _output_data_buf_offset;
  private List<E2EVadFrameProbEntity> _frame_probs = new List<E2EVadFrameProbEntity>();

  // Thresholds; both are overwritten from the options in AllResetDetection().
  private int _max_end_sil_frame_cnt_thresh = 800 - 150;
  private float _speech_noise_thres = 0.6F;

  // Scores of the current chunk and the frame offset of that chunk.
  private float[,,] _scores;
  private int _idx_pre_chunk;
  private bool _max_time_out;

  // One energy value (dB) per frame, plus sample counters for the audio seen so far.
  private List<double> _decibel = new List<double>();
  private int _data_buf_size;
  private int _data_buf_all_size;
  /// <summary>
  /// Creates a VAD post-processor configured by <paramref name="vadPostConfEntity"/>.
  /// </summary>
  /// <param name="vadPostConfEntity">
  /// Options object; it is stored and also used to size the window detector.
  /// </param>
  public E2EVadModel(VadPostConfEntity vadPostConfEntity)
  {
      _vad_opts = vadPostConfEntity;

      // The detector is built from the window size, the two transition
      // thresholds and the frame shift carried by the options.
      WindowDetector detector = new WindowDetector(
          _vad_opts.window_size_ms,
          _vad_opts.sil_to_speech_time_thres,
          _vad_opts.speech_to_sil_time_thres,
          _vad_opts.frame_in_ms);
      _windows_detector = detector;

      // Bring every field to its start-of-stream value.
      AllResetDetection();
  }
  /// <summary>
  /// Restores the whole model to its start-of-stream state: stream counters,
  /// output buffers, thresholds derived from the options, and (through
  /// <see cref="ResetDetection"/>) the per-segment state.
  /// </summary>
  private void AllResetDetection()
  {
      // Stream-level counters and flags.
      _is_final = false;
      _max_time_out = false;
      _pre_end_silence_detected = false;
      _next_seg = true;
      _frm_cnt = 0;
      _idx_pre_chunk = 0;
      _number_end_time_detected = 0;
      _data_buf_start_frame = 0;
      _data_buf_size = 0;
      _data_buf_all_size = 0;

      // Buffers.
      _scores = null;
      _decibel = new List<double>();
      _output_data_buf = new List<E2EVadSpeechBufWithDoaEntity>();
      _output_data_buf_offset = 0;

      // Values taken from the options.
      _sil_pdf_ids = _vad_opts.sil_pdf_ids;
      _speech_noise_thres = _vad_opts.speech_noise_thres;
      _max_end_sil_frame_cnt_thresh = _vad_opts.max_end_silence_time - _vad_opts.speech_to_sil_time_thres;

      // Sentinel for "no noise estimate yet".
      _noise_average_decibel = -100.0D;

      // The remaining per-segment fields (confirmed frames, silence counter,
      // state machine, silence frame count, frame probabilities) and the window
      // detector are all reset by ResetDetection().
      ResetDetection();
  }
  /// <summary>
  /// Resets only the per-segment state so that a new segment can be detected
  /// on the same stream; stream counters and output buffers are left untouched.
  /// </summary>
  private void ResetDetection()
  {
      // Back to "waiting for a start point".
      _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;

      // Forget the confirmed boundaries of the previous segment.
      _confirmed_start_frame = -1;
      _confirmed_end_frame = -1;
      _lastest_confirmed_silence_frame = -1;
      _latest_confirmed_speech_frame = 0;

      // Clear the silence counters and the collected frame probabilities.
      _continous_silence_frame_count = 0;
      _sil_frame = 0;
      _frame_probs = new List<E2EVadFrameProbEntity>();

      _windows_detector.Reset();
  }
  /// <summary>
  /// Appends one energy value (in dB) per analysis frame of
  /// <paramref name="waveform"/> to <c>_decibel</c> and updates the running
  /// sample counters.
  /// </summary>
  /// <param name="waveform">Samples of the newly received chunk.</param>
  /// <exception cref="InvalidOperationException">
  /// The configured frame length or frame shift is not positive. Previously a
  /// zero frame shift made the framing loop run forever.
  /// </exception>
  private void ComputeDecibel(float[] waveform)
  {
      int frameSampleLength = (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000);
      int frameShiftLength = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
      if (frameSampleLength <= 0 || frameShiftLength <= 0)
      {
          throw new InvalidOperationException(
              "VAD frame length and frame shift must both be at least one sample.");
      }

      // The first non-empty chunk also seeds the "remaining samples" counter.
      bool isFirstChunk = _data_buf_all_size == 0;
      _data_buf_all_size += waveform.Length;
      if (isFirstChunk)
      {
          _data_buf_size = _data_buf_all_size;
      }

      // Only complete frames are measured; a trailing partial frame is skipped.
      int lastFrameStart = waveform.Length - frameSampleLength;
      for (int offset = 0; offset <= lastFrameStart; offset += frameShiftLength)
      {
          // Sum of squares computed in place, without temporary arrays.
          double energy = 0.0;
          for (int i = 0; i < frameSampleLength; i++)
          {
              float sample = waveform[offset + i];
              energy += sample * sample;
          }

          // The small constant keeps the logarithm finite for an all-zero frame.
          _decibel.Add(10 * Math.Log10((float)energy + 0.000001));
      }
  }
  /// <summary>
  /// Registers the scores of the current chunk: stores the tensor, advances the
  /// total frame counter by the size of dimension 1, and records that size as
  /// the evaluation block size in the options.
  /// </summary>
  /// <param name="scores">Score tensor for the chunk; dimension 1 is used as the frame count.</param>
  private void ComputeScores(float[,,] scores)
  {
      int chunkFrames = scores.GetLength(1);

      _scores = scores;
      _frm_cnt += chunkFrames;
      _vad_opts.nn_eval_block_size = chunkFrames;
  }
  /// <summary>
  /// Advances the start of the (virtual) data buffer frame by frame until it
  /// reaches <paramref name="frame_idx"/>, updating the count of remaining
  /// samples after each step.
  /// </summary>
  /// <param name="frame_idx">Frame index the buffer start should advance to.</param>
  /// <remarks>
  /// The loop stops early once fewer samples than one frame shift remain.
  /// Previously that situation left the loop condition unchanged and the
  /// method never returned.
  /// </remarks>
  private void PopDataBufTillFrame(int frame_idx)
  {
      // Samples consumed per frame; it does not change inside the loop.
      int frameShiftSamples = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);

      while (_data_buf_start_frame < frame_idx)
      {
          if (_data_buf_size < frameShiftSamples)
          {
              // Not enough buffered samples to drop another frame.
              break;
          }

          _data_buf_start_frame += 1;
          _data_buf_size = _data_buf_all_size - _data_buf_start_frame * frameShiftSamples;
      }
  }
  /// <summary>
  /// Accounts for <paramref name="frm_cnt"/> frames starting at
  /// <paramref name="start_frm"/> in the current output segment, opening a new
  /// segment when needed. Only segment timestamps and flags are updated; no
  /// audio samples are copied into the segment.
  /// </summary>
  /// <param name="start_frm">Index of the first frame to account for.</param>
  /// <param name="frm_cnt">Number of frames to account for.</param>
  /// <param name="first_frm_is_start_point">True to open a new segment and mark it as containing a start point.</param>
  /// <param name="last_frm_is_end_point">True to mark the segment as containing an end point.</param>
  /// <param name="end_point_is_sent_end">True when the end point coincides with the end of the input.</param>
  private void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point,
      bool last_frm_is_end_point, bool end_point_is_sent_end)
  {
      PopDataBufTillFrame(start_frm);

      int expected_sample_number = (int)(frm_cnt * _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000);
      if (last_frm_is_end_point)
      {
          // The last frame spans a full window, which is longer than one shift.
          int extra_sample = Math.Max(0, (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000 - _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000));
          expected_sample_number += extra_sample;
      }
      if (end_point_is_sent_end)
      {
          expected_sample_number = Math.Max(expected_sample_number, _data_buf_size);
      }
      if (_data_buf_size < expected_sample_number)
      {
          Console.WriteLine("error in calling pop data_buf\n");
      }

      if (_output_data_buf.Count == 0 || first_frm_is_start_point)
      {
          // Open a new, empty segment that starts at this frame.
          E2EVadSpeechBufWithDoaEntity new_seg = new E2EVadSpeechBufWithDoaEntity();
          _output_data_buf.Add(new_seg);
          new_seg.Reset();
          new_seg.start_ms = start_frm * _vad_opts.frame_in_ms;
          new_seg.end_ms = new_seg.start_ms;
          new_seg.doa = 0;
      }

      E2EVadSpeechBufWithDoaEntity cur_seg = _output_data_buf[_output_data_buf.Count - 1];

      // The frames are expected to continue exactly where the segment ended.
      bool not_contiguous = cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms;
      if (not_contiguous)
      {
          Console.WriteLine("warning\n");
      }

      int data_to_pop = end_point_is_sent_end
          ? expected_sample_number
          : (int)(frm_cnt * _vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
      if (data_to_pop > _data_buf_size)
      {
          Console.WriteLine("VAD data_to_pop is bigger than _data_buf_size!!!\n");
      }

      // The former per-sample loops only advanced an unused position counter
      // (the segment buffer was never written), so they are omitted.
      cur_seg.doa = 0;

      if (not_contiguous)
      {
          Console.WriteLine("Something wrong with the VAD algorithm\n");
      }

      _data_buf_start_frame += frm_cnt;
      cur_seg.end_ms = (start_frm + frm_cnt) * _vad_opts.frame_in_ms;
      if (first_frm_is_start_point)
      {
          cur_seg.contain_seg_start_point = true;
      }
      if (last_frm_is_end_point)
      {
          cur_seg.contain_seg_end_point = true;
      }
  }
  /// <summary>
  /// Records <paramref name="valid_frame"/> as the latest confirmed silence
  /// frame and, while no start point has been detected, advances the data
  /// buffer up to that frame.
  /// </summary>
  /// <param name="valid_frame">Index of the frame confirmed as silence.</param>
  private void OnSilenceDetected(int valid_frame)
  {
      _lastest_confirmed_silence_frame = valid_frame;

      bool waitingForStart = _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected;
      if (!waitingForStart)
      {
          return;
      }

      PopDataBufTillFrame(valid_frame);
  }
  /// <summary>
  /// Records <paramref name="valid_frame"/> as the latest confirmed speech
  /// frame and appends that single frame to the current output segment.
  /// </summary>
  /// <param name="valid_frame">Index of the frame confirmed as speech.</param>
  private void OnVoiceDetected(int valid_frame)
  {
      _latest_confirmed_speech_frame = valid_frame;

      // One frame, neither a start point nor an end point.
      PopDataToOutputBuf(
          valid_frame,
          1,
          first_frm_is_start_point: false,
          last_frm_is_end_point: false,
          end_point_is_sent_end: false);
  }
  /// <summary>
  /// Confirms the start of a speech segment at <paramref name="start_frame"/>
  /// and opens a new output segment for it.
  /// </summary>
  /// <param name="start_frame">Frame index of the start point.</param>
  /// <param name="fake_result">
  /// When true, the output segment is only opened if no start point had been
  /// detected so far.
  /// </param>
  private void OnVoiceStart(int start_frame, bool fake_result = false)
  {
      if (_confirmed_start_frame == -1)
      {
          _confirmed_start_frame = start_frame;
      }
      else
      {
          // A start frame is still set, so the previous segment was not reset.
          Console.WriteLine("not reset vad properly\n");
      }

      bool startAlreadyDetected = _vad_state_machine != (int)VadStateMachine.kVadInStateStartPointNotDetected;
      if (fake_result && startAlreadyDetected)
      {
          return;
      }

      PopDataToOutputBuf(_confirmed_start_frame, 1, true, false, false);
  }
  /// <summary>
  /// Confirms the end of the current speech segment at
  /// <paramref name="end_frame"/>, first flushing every frame between the
  /// latest confirmed speech frame and the end frame as speech.
  /// </summary>
  /// <param name="end_frame">Frame index of the end point.</param>
  /// <param name="fake_result">When true, the end frame itself is not written to the output segment.</param>
  /// <param name="is_last_frame">Passed on as "end point coincides with the end of the input".</param>
  private void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame)
  {
      // Frames strictly before the end frame still belong to the segment.
      int firstPending = _latest_confirmed_speech_frame + 1;
      for (int frame = firstPending; frame < end_frame; frame++)
      {
          OnVoiceDetected(frame);
      }

      if (_confirmed_end_frame == -1)
      {
          _confirmed_end_frame = end_frame;
      }
      else
      {
          // An end frame is still set, so the previous segment was not reset.
          Console.WriteLine("not reset vad properly\n");
      }

      if (!fake_result)
      {
          _sil_frame = 0;
          PopDataToOutputBuf(_confirmed_end_frame, 1, false, true, is_last_frame);
      }

      _number_end_time_detected += 1;
  }
  /// <summary>
  /// Closes the current segment at <paramref name="cur_frm_idx"/> when that
  /// frame is the final frame of the input; otherwise does nothing.
  /// </summary>
  /// <param name="is_final_frame">True when the frame is the last one of the input.</param>
  /// <param name="cur_frm_idx">Index of the current frame.</param>
  private void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx)
  {
      if (!is_final_frame)
      {
          return;
      }

      OnVoiceEnd(cur_frm_idx, false, true);
      _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
  }
  /// <summary>
  /// Returns the start-point latency as the latency frame count multiplied by
  /// the <c>frame_in_ms</c> option.
  /// </summary>
  private int GetLatency()
  {
      int latencyFrames = LatencyFrmNumAtStartPoint();
      return (int)(latencyFrames * _vad_opts.frame_in_ms);
  }
  /// <summary>
  /// Returns the number of frames by which a detected start point lags: the
  /// detector window size, plus the start-point look-back (in frames) when the
  /// <c>do_extend</c> option is non-zero.
  /// </summary>
  private int LatencyFrmNumAtStartPoint()
  {
      int windowFrames = _windows_detector.GetWinSize();
      if (_vad_opts.do_extend == 0)
      {
          return windowFrames;
      }

      int lookbackFrames = (int)(_vad_opts.lookback_time_start_point / _vad_opts.frame_in_ms);
      return windowFrames + lookbackFrames;
  }
  /// <summary>
  /// Classifies frame <paramref name="t"/> as speech or silence from its energy
  /// and from the silence scores of the current chunk, updating the running
  /// noise level when the frame is judged to be noise.
  /// </summary>
  /// <param name="t">Absolute frame index; also the index into <c>_decibel</c>.</param>
  /// <returns>The frame state, either speech or silence.</returns>
  private FrameState GetFrameState(int t)
  {
      double frameDecibel = _decibel[t];
      double frameSnr = frameDecibel - _noise_average_decibel;

      // Frames below the energy threshold are silence without consulting the
      // scores. NOTE(review): this path feeds the frame to DetectOneFrame
      // itself, and the callers in this file call DetectOneFrame again with the
      // returned state - confirm that the double call is intended.
      if (frameDecibel < _vad_opts.decibel_thres)
      {
          DetectOneFrame(FrameState.kFrameStateSil, t, false);
          return FrameState.kFrameStateSil;
      }

      double speechScore = 0.0D;
      double noiseLogProb = 0.0D;
      Trace.Assert(_sil_pdf_ids.Length == _vad_opts.silence_pdf_num, "");
      if (_sil_pdf_ids.Length > 0)
      {
          // Only batch_size = 1 is supported.
          Trace.Assert(_scores.GetLength(0) == 1, "只支持batch_size = 1的测试");

          // Gather the silence-class scores of this frame; the frame index is
          // made relative to the start of the current chunk.
          int chunkFrame = t - _idx_pre_chunk;
          float[] silenceScores = Array.ConvertAll(_sil_pdf_ids, pdfId => _scores[0, chunkFrame, pdfId]);
          double silenceScore = silenceScores.Sum();

          noiseLogProb = Math.Log(silenceScore) * _vad_opts.speech_2_noise_ratio;
          speechScore = 1.0D - silenceScore;
      }

      double speechLogProb = Math.Log(speechScore);
      if (_vad_opts.output_frame_probs)
      {
          _frame_probs.Add(new E2EVadFrameProbEntity
          {
              noise_prob = noiseLogProb,
              speech_prob = speechLogProb,
              score = speechScore,
              frame_id = t
          });
      }

      bool speechLikely = Math.Exp(speechLogProb) >= Math.Exp(noiseLogProb) + _speech_noise_thres;
      if (!speechLikely)
      {
          // Noise frame: seed the noise level on first use, otherwise fold the
          // frame into the running average.
          if (_noise_average_decibel < -99.9)
          {
              _noise_average_decibel = frameDecibel;
          }
          else
          {
              _noise_average_decibel = (frameDecibel + _noise_average_decibel * (_vad_opts.noise_frame_num_used_for_snr - 1)) / _vad_opts.noise_frame_num_used_for_snr;
          }
          return FrameState.kFrameStateSil;
      }

      // Speech-like scores still need enough SNR and energy to count as speech.
      bool loudEnough = frameSnr >= _vad_opts.snr_thres && frameDecibel >= _vad_opts.decibel_thres;
      return loudEnough ? FrameState.kFrameStateSpeech : FrameState.kFrameStateSil;
  }
  /// <summary>
  /// Processes one chunk of scores and audio and returns the segments that
  /// became reportable.
  /// </summary>
  /// <param name="score">Score tensor of the chunk; dimension 0 is the batch size.</param>
  /// <param name="waveform">Audio samples of the chunk.</param>
  /// <param name="is_final">True for the last chunk; the model is fully reset afterwards.</param>
  /// <param name="max_end_sil">End-silence limit used to derive the end-point threshold for this call.</param>
  /// <param name="online">
  /// True to report a segment's start and end separately, with -1 standing for
  /// the part that is not reported in this call.
  /// </param>
  /// <returns>
  /// One entry per batch item; an entry is null when no segment was reported
  /// for that item. Each segment is a { start_ms, end_ms } pair.
  /// </returns>
  public SegmentEntity[] DefaultCall(float[,,] score, float[] waveform,
      bool is_final = false, int max_end_sil = 800, bool online = false
  )
  {
      _max_end_sil_frame_cnt_thresh = max_end_sil - _vad_opts.speech_to_sil_time_thres;

      // Per-frame energy first, then the scores, then the detection pass.
      ComputeDecibel(waveform);
      ComputeScores(score);
      if (is_final)
      {
          DetectLastFrames();
      }
      else
      {
          DetectCommonFrames();
      }

      int batchSize = score.GetLength(0);
      SegmentEntity[] segments = new SegmentEntity[batchSize];

      // Only batch_size = 1 is supported for now.
      for (int batchIndex = 0; batchIndex < batchSize; batchIndex++)
      {
          List<int[]> reported = new List<int[]>();
          for (int i = _output_data_buf_offset; i < _output_data_buf.Count; i++)
          {
              E2EVadSpeechBufWithDoaEntity seg = _output_data_buf[i];
              int start_ms;
              int end_ms;

              if (online)
              {
                  bool hasEnd = seg.contain_seg_end_point;

                  // Skip segments without a start point, and segments whose
                  // start was already reported but whose end is still open.
                  if (!seg.contain_seg_start_point || (!_next_seg && !hasEnd))
                  {
                      continue;
                  }

                  start_ms = _next_seg ? seg.start_ms : -1;
                  if (hasEnd)
                  {
                      end_ms = seg.end_ms;
                      _next_seg = true;
                      _output_data_buf_offset += 1;
                  }
                  else
                  {
                      end_ms = -1;
                      _next_seg = false;
                  }
              }
              else
              {
                  // Before the final chunk only complete segments are reported.
                  bool complete = seg.contain_seg_start_point && seg.contain_seg_end_point;
                  if (!is_final && !complete)
                  {
                      continue;
                  }

                  start_ms = seg.start_ms;
                  end_ms = seg.end_ms;
                  _output_data_buf_offset += 1;
              }

              reported.Add(new int[] { start_ms, end_ms });
          }

          if (reported.Count > 0)
          {
              if (segments[batchIndex] == null)
              {
                  segments[batchIndex] = new SegmentEntity();
              }
              segments[batchIndex].Segment.AddRange(reported);
          }
      }

      if (is_final)
      {
          // Reset all state so the instance is ready for the next query.
          AllResetDetection();
      }
      return segments;
  }
  /// <summary>
  /// Runs detection over every frame of the current (non-final) chunk and then
  /// advances the chunk offset used to index the scores.
  /// </summary>
  /// <returns>Always 0.</returns>
  private int DetectCommonFrames()
  {
      if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
      {
          return 0;
      }

      // The chunk covers the last nn_eval_block_size frames counted so far.
      int firstFrame = _frm_cnt - _vad_opts.nn_eval_block_size;
      for (int frame = firstFrame; frame < _frm_cnt; frame++)
      {
          FrameState state = GetFrameState(frame);
          DetectOneFrame(state, frame, false);
      }

      _idx_pre_chunk += _scores.GetLength(1) * _scores.GetLength(0);
      return 0;
  }
  /// <summary>
  /// Runs detection over every frame of the final chunk, flagging the very
  /// last frame as the final frame of the input.
  /// </summary>
  /// <returns>Always 0.</returns>
  private int DetectLastFrames()
  {
      if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
      {
          return 0;
      }

      // The chunk covers the last nn_eval_block_size frames counted so far.
      int lastFrame = _frm_cnt - 1;
      int firstFrame = _frm_cnt - _vad_opts.nn_eval_block_size;
      for (int frame = firstFrame; frame <= lastFrame; frame++)
      {
          FrameState state = GetFrameState(frame);
          DetectOneFrame(state, frame, frame == lastFrame);
      }

      return 0;
  }
  /// <summary>
  /// Feeds one classified frame to the window detector and advances the
  /// endpoint state machine according to the transition it reports.
  /// </summary>
  /// <param name="cur_frm_state">Speech/silence decision for the frame.</param>
  /// <param name="cur_frm_idx">Absolute index of the frame.</param>
  /// <param name="is_final_frame">True when the frame is the last one of the input.</param>
  private void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame)
  {
      // Map the frame decision onto the detector input. A speech frame is kept
      // as speech only while the constant prior 1.0 exceeds fe_prior_thres.
      FrameState detectorInput = FrameState.kFrameStateInvalid;
      if (cur_frm_state == FrameState.kFrameStateSpeech)
      {
          detectorInput = Math.Abs(1.0) > _vad_opts.fe_prior_thres
              ? FrameState.kFrameStateSpeech
              : FrameState.kFrameStateSil;
      }
      else if (cur_frm_state == FrameState.kFrameStateSil)
      {
          detectorInput = FrameState.kFrameStateSil;
      }

      AudioChangeState state_change = _windows_detector.DetectOneFrame(detectorInput, cur_frm_idx);
      int frm_shift_in_ms = _vad_opts.frame_in_ms;

      switch (state_change)
      {
          case AudioChangeState.kChangeStateSil2Speech:
              _continous_silence_frame_count = 0;
              _pre_end_silence_detected = false;
              if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
              {
                  // Back-date the start point by the detection latency, but not
                  // before the start of the data buffer.
                  int start_frame = Math.Max(_data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
                  OnVoiceStart(start_frame);
                  _vad_state_machine = (int)VadStateMachine.kVadInStateInSpeechSegment;
                  for (int frame = start_frame + 1; frame < cur_frm_idx + 1; frame++)
                  {
                      OnVoiceDetected(frame);
                  }
              }
              else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
              {
                  // Catch up on the frames skipped since the last confirmed speech frame.
                  for (int frame = _latest_confirmed_speech_frame + 1; frame < cur_frm_idx; frame++)
                  {
                      OnVoiceDetected(frame);
                  }
                  HandleFrameInsideSpeech(cur_frm_idx, is_final_frame, frm_shift_in_ms, false);
              }
              else
              {
                  return;
              }
              break;

          case AudioChangeState.kChangeStateSpeech2Sil:
              _continous_silence_frame_count = 0;
              if (_vad_state_machine != (int)VadStateMachine.kVadInStateInSpeechSegment)
              {
                  return;
              }
              HandleFrameInsideSpeech(cur_frm_idx, is_final_frame, frm_shift_in_ms, false);
              break;

          case AudioChangeState.kChangeStateSpeech2Speech:
              _continous_silence_frame_count = 0;
              if (_vad_state_machine != (int)VadStateMachine.kVadInStateInSpeechSegment)
              {
                  return;
              }
              HandleFrameInsideSpeech(cur_frm_idx, is_final_frame, frm_shift_in_ms, true);
              break;

          case AudioChangeState.kChangeStateSil2Sil:
              _continous_silence_frame_count += 1;
              if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
              {
                  bool startSilenceTimedOut =
                      _vad_opts.detect_mode == (int)VadDetectMode.kVadSingleUtteranceDetectMode
                      && _continous_silence_frame_count * frm_shift_in_ms > _vad_opts.max_start_silence_time;
                  bool inputEndedWithoutSpeech = is_final_frame && _number_end_time_detected == 0;

                  if (startSilenceTimedOut || inputEndedWithoutSpeech)
                  {
                      // Silence timeout: report a zero-length decision.
                      for (int frame = _lastest_confirmed_silence_frame + 1; frame < cur_frm_idx; frame++)
                      {
                          OnSilenceDetected(frame);
                      }
                      OnVoiceStart(0, true);
                      OnVoiceEnd(0, true, false);
                      _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
                  }
                  else if (cur_frm_idx >= LatencyFrmNumAtStartPoint())
                  {
                      OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
                  }
              }
              else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
              {
                  if (_continous_silence_frame_count * frm_shift_in_ms >= _max_end_sil_frame_cnt_thresh)
                  {
                      // Enough trailing silence: place the end point back where
                      // the silence started, minus the look-ahead when extending.
                      int lookback_frame = (int)(_max_end_sil_frame_cnt_thresh / frm_shift_in_ms);
                      if (_vad_opts.do_extend != 0)
                      {
                          lookback_frame -= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms);
                          lookback_frame -= 1;
                          lookback_frame = Math.Max(0, lookback_frame);
                      }
                      OnVoiceEnd(cur_frm_idx - lookback_frame, false, false);
                      _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
                  }
                  else if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
                  {
                      OnVoiceEnd(cur_frm_idx, false, false);
                      _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
                  }
                  else if (_vad_opts.do_extend != 0 && !is_final_frame)
                  {
                      // Short silences within the look-ahead still count as speech.
                      if (_continous_silence_frame_count <= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms))
                      {
                          OnVoiceDetected(cur_frm_idx);
                      }
                  }
                  else
                  {
                      MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
                  }
              }
              else
              {
                  return;
              }
              break;

          default:
              break;
      }

      // In multiple-utterance mode a finished segment immediately re-arms the detector.
      if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected
          && _vad_opts.detect_mode == (int)VadDetectMode.kVadMutipleUtteranceDetectMode)
      {
          ResetDetection();
      }
  }

  /// <summary>
  /// Handles a frame that arrives while a speech segment is open: ends the
  /// segment when it exceeds the maximum segment length, otherwise records the
  /// frame as speech, or closes the segment when the frame is the final one.
  /// </summary>
  /// <param name="cur_frm_idx">Absolute index of the frame.</param>
  /// <param name="is_final_frame">True when the frame is the last one of the input.</param>
  /// <param name="frm_shift_in_ms">Frame shift in milliseconds.</param>
  /// <param name="flag_time_out">True to set <c>_max_time_out</c> when the length limit is hit.</param>
  private void HandleFrameInsideSpeech(int cur_frm_idx, bool is_final_frame, int frm_shift_in_ms, bool flag_time_out)
  {
      if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
      {
          if (flag_time_out)
          {
              _max_time_out = true;
          }
          OnVoiceEnd(cur_frm_idx, false, false);
          _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
      }
      else if (!is_final_frame)
      {
          OnVoiceDetected(cur_frm_idx);
      }
      else
      {
          MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
      }
  }
  651. }
  652. }