| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717 |
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.Linq;
- using System.Text;
- using System.Threading.Tasks;
- using AliFsmnVadSharp.Model;
- namespace AliFsmnVadSharp
- {
/// <summary>
/// Phases of the end-point detection state machine. <c>E2EVadModel</c> stores the
/// current phase as an <c>int</c> in its <c>_vad_state_machine</c> field.
/// </summary>
enum VadStateMachine
{
    /// <summary>Initial phase after a reset: no speech start point has been confirmed.</summary>
    kVadInStateStartPointNotDetected = 1,

    /// <summary>A start point has been confirmed and the segment is still open.</summary>
    kVadInStateInSpeechSegment = 2,

    /// <summary>The end point of the current segment has been reported.</summary>
    kVadInStateEndPointDetected = 3,
}
/// <summary>
/// Detection modes compared against <c>VadPostConfEntity.detect_mode</c>.
/// </summary>
enum VadDetectMode
{
    /// <summary>
    /// Single-utterance mode: a leading-silence timeout applies and the state
    /// machine stays at the end point once one has been detected.
    /// </summary>
    kVadSingleUtteranceDetectMode = 0,

    /// <summary>
    /// Multiple-utterance mode: per-segment state is reset after every end point
    /// so that further segments can be detected. (The spelling of the member name
    /// is kept as-is because other code refers to it.)
    /// </summary>
    kVadMutipleUtteranceDetectMode = 1,
}
- internal class E2EVadModel
- {
// ---- Configuration and collaborators (both are replaced by the constructor) ----
private VadPostConfEntity _vad_opts = new VadPostConfEntity();
private WindowDetector _windows_detector = new WindowDetector();

// Assigned on reset only; not read anywhere in this class.
private bool _is_final = false;

// Index of the first frame still counted as buffered input.
private int _data_buf_start_frame = 0;

// Total number of score frames received so far (see ComputeScores).
private int _frm_cnt = 0;

// ---- Per-segment state (cleared by ResetDetection) ----
private int _latest_confirmed_speech_frame = 0;
private int _lastest_confirmed_silence_frame = -1;
private int _continous_silence_frame_count = 0;
private int _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
private int _confirmed_start_frame = -1;
private int _confirmed_end_frame = -1;

// Number of times OnVoiceEnd has run since the last full reset.
private int _number_end_time_detected = 0;
private int _sil_frame = 0;

// Score-column indices that are summed to obtain the silence score.
private int[] _sil_pdf_ids = new int[0];

// Running noise-floor estimate in dB; values below -99.9 mean "not initialised yet".
private double _noise_average_decibel = -100.0;

// Assigned but not read anywhere in this class.
private bool _pre_end_silence_detected = false;

// Online mode: true when the next reported segment should carry its start time.
private bool _next_seg = true;

// ---- Output ----
private List<E2EVadSpeechBufWithDoaEntity> _output_data_buf;
private int _output_data_buf_offset = 0;
private List<E2EVadFrameProbEntity> _frame_probs = new List<E2EVadFrameProbEntity>();

// Compared against a silence duration in milliseconds, despite the "frame_cnt" in the name.
private int _max_end_sil_frame_cnt_thresh = 800 - 150;
private float _speech_noise_thres = 0.6f;

// Scores of the most recent chunk and the frame offset of that chunk's first row.
private float[,,] _scores = null;
private int _idx_pre_chunk = 0;

// Set when a segment is closed for exceeding the maximum length; not read in this class.
private bool _max_time_out = false;

// Per-frame energy in dB, appended by ComputeDecibel.
private List<double> _decibel = new List<double>();

// Sample counters: samples still buffered / samples received in total.
private int _data_buf_size = 0;
private int _data_buf_all_size = 0;
/// <summary>
/// Creates a VAD post-processor configured by <paramref name="vadPostConfEntity"/>
/// and puts it into its initial (fully reset) state.
/// </summary>
/// <param name="vadPostConfEntity">Post-processing options; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="vadPostConfEntity"/> is null.
/// </exception>
public E2EVadModel(VadPostConfEntity vadPostConfEntity)
{
    // Fail with a descriptive exception instead of a NullReferenceException
    // on the first option access below.
    if (vadPostConfEntity == null)
    {
        throw new ArgumentNullException(nameof(vadPostConfEntity));
    }

    _vad_opts = vadPostConfEntity;
    _windows_detector = new WindowDetector(_vad_opts.window_size_ms,
        _vad_opts.sil_to_speech_time_thres,
        _vad_opts.speech_to_sil_time_thres,
        _vad_opts.frame_in_ms);
    AllResetDetection();
}
/// <summary>
/// Restores the whole model to its initial state: stream-level counters, buffers,
/// configuration-derived thresholds and, through <see cref="ResetDetection"/>,
/// the per-segment state.
/// </summary>
private void AllResetDetection()
{
    // Stream-level flags and counters.
    _is_final = false;
    _max_time_out = false;
    _pre_end_silence_detected = false;
    _next_seg = true;
    _number_end_time_detected = 0;
    _frm_cnt = 0;
    _idx_pre_chunk = 0;
    _scores = null;
    _noise_average_decibel = -100.0;

    // Input bookkeeping and output buffers.
    _data_buf_start_frame = 0;
    _data_buf_size = 0;
    _data_buf_all_size = 0;
    _decibel = new List<double>();
    _output_data_buf = new List<E2EVadSpeechBufWithDoaEntity>();
    _output_data_buf_offset = 0;

    // Values taken from the configuration.
    _sil_pdf_ids = _vad_opts.sil_pdf_ids;
    _speech_noise_thres = _vad_opts.speech_noise_thres;
    _max_end_sil_frame_cnt_thresh = _vad_opts.max_end_silence_time - _vad_opts.speech_to_sil_time_thres;

    // Per-segment fields (state machine, confirmed frames, silence counters,
    // frame probabilities) and the window detector are reset here.
    ResetDetection();
}
/// <summary>
/// Clears the per-segment state so that a new segment can be detected:
/// the state machine returns to "start point not detected", confirmed frame
/// markers and silence counters are cleared, and the window detector is reset.
/// </summary>
private void ResetDetection()
{
    _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
    _confirmed_start_frame = -1;
    _confirmed_end_frame = -1;
    _latest_confirmed_speech_frame = 0;
    _lastest_confirmed_silence_frame = -1;
    _continous_silence_frame_count = 0;

    _windows_detector.Reset();

    _sil_frame = 0;
    _frame_probs = new List<E2EVadFrameProbEntity>();
}
/// <summary>
/// Updates the sample counters for a newly received waveform chunk and appends
/// one energy value per frame to <c>_decibel</c>. A frame's value is
/// <c>10 * log10(sum of squared samples + 1e-6)</c>.
/// </summary>
/// <param name="waveform">Samples of the current chunk.</param>
private void ComputeDecibel(float[] waveform)
{
    int samplesPerFrame = (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000);
    int samplesPerShift = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);

    // The very first chunk also initialises the "still buffered" counter.
    bool isFirstChunk = _data_buf_all_size == 0;
    _data_buf_all_size += waveform.Length;
    if (isFirstChunk)
    {
        _data_buf_size = _data_buf_all_size;
    }

    // Slide a window of samplesPerFrame samples in steps of samplesPerShift;
    // only complete windows are evaluated.
    for (int start = 0; start + samplesPerFrame <= waveform.Length; start += samplesPerShift)
    {
        float[] frame = new float[samplesPerFrame];
        Array.Copy(waveform, start, frame, 0, frame.Length);

        float energy = frame.Select(sample => (float)Math.Pow((double)sample, 2)).Sum();
        _decibel.Add(10 * Math.Log10(energy + 0.000001));
    }
}
/// <summary>
/// Registers the score tensor of the current chunk: its second dimension becomes
/// the evaluation block size and is added to the running frame count.
/// </summary>
/// <param name="scores">Score tensor of the current chunk.</param>
private void ComputeScores(float[,,] scores)
{
    int framesInChunk = scores.GetLength(1);

    _scores = scores;
    _vad_opts.nn_eval_block_size = framesInChunk;
    _frm_cnt += framesInChunk;
}
/// <summary>
/// Advances the start of the buffered input to <paramref name="frame_idx"/>,
/// one frame shift at a time, updating the count of samples still buffered.
/// </summary>
/// <param name="frame_idx">Frame index the buffer start should be advanced to.</param>
/// <remarks>
/// Advancing stops early when less than one frame shift of samples remains.
/// Previously that situation left the loop condition unchanged and the method
/// never returned.
/// </remarks>
private void PopDataBufTillFrame(int frame_idx)
{
    // Loop-invariant: number of samples in one frame shift.
    int samplesPerShift = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);

    while (_data_buf_start_frame < frame_idx)
    {
        if (_data_buf_size < samplesPerShift)
        {
            // Nothing more can be popped; bail out instead of spinning forever.
            break;
        }

        _data_buf_start_frame += 1;
        _data_buf_size = _data_buf_all_size - _data_buf_start_frame * samplesPerShift;
    }
}
/// <summary>
/// Extends the current output segment (or opens a new one) by
/// <paramref name="frm_cnt"/> frames starting at <paramref name="start_frm"/>.
/// Only segment timing and start/end flags are maintained; no audio samples
/// are copied into the segment.
/// </summary>
/// <param name="start_frm">Index of the first frame to append.</param>
/// <param name="frm_cnt">Number of frames to append.</param>
/// <param name="first_frm_is_start_point">True to open a new segment that contains a start point.</param>
/// <param name="last_frm_is_end_point">True to mark the segment as containing its end point.</param>
/// <param name="end_point_is_sent_end">True when the end point is also the end of the input.</param>
/// <remarks>
/// Inconsistencies are reported on the console only; processing continues.
/// The earlier per-sample counting loops had no effect and were removed.
/// </remarks>
private void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point,
    bool last_frm_is_end_point, bool end_point_is_sent_end)
{
    PopDataBufTillFrame(start_frm);

    // Number of samples the requested frames are expected to cover.
    int expected_sample_number = (int)(frm_cnt * _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000);
    if (last_frm_is_end_point)
    {
        // The last frame spans a full frame length rather than a single shift.
        expected_sample_number += Math.Max(0, (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000 - _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000));
    }
    if (end_point_is_sent_end)
    {
        expected_sample_number = Math.Max(expected_sample_number, _data_buf_size);
    }
    if (_data_buf_size < expected_sample_number)
    {
        Console.WriteLine("error in calling pop data_buf\n");
    }

    if (_output_data_buf.Count == 0 || first_frm_is_start_point)
    {
        E2EVadSpeechBufWithDoaEntity opened = new E2EVadSpeechBufWithDoaEntity();
        opened.Reset();
        opened.start_ms = start_frm * _vad_opts.frame_in_ms;
        opened.end_ms = opened.start_ms;
        opened.doa = 0;
        _output_data_buf.Add(opened);
    }

    E2EVadSpeechBufWithDoaEntity cur_seg = _output_data_buf[_output_data_buf.Count - 1];

    // The appended frames should start exactly where the segment currently ends.
    bool isContiguous = cur_seg.end_ms == start_frm * _vad_opts.frame_in_ms;
    if (!isContiguous)
    {
        Console.WriteLine("warning\n");
    }

    int data_to_pop = end_point_is_sent_end
        ? expected_sample_number
        : (int)(frm_cnt * _vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
    if (data_to_pop > _data_buf_size)
    {
        Console.WriteLine("VAD data_to_pop is bigger than _data_buf_size!!!\n");
    }

    cur_seg.doa = 0;

    if (!isContiguous)
    {
        Console.WriteLine("Something wrong with the VAD algorithm\n");
    }

    _data_buf_start_frame += frm_cnt;
    cur_seg.end_ms = (start_frm + frm_cnt) * _vad_opts.frame_in_ms;
    if (first_frm_is_start_point)
    {
        cur_seg.contain_seg_start_point = true;
    }
    if (last_frm_is_end_point)
    {
        cur_seg.contain_seg_end_point = true;
    }
}
/// <summary>
/// Records <paramref name="valid_frame"/> as the latest confirmed silence frame
/// and, while no start point has been detected yet, discards buffered input
/// before that frame.
/// </summary>
/// <param name="valid_frame">Index of the confirmed silence frame.</param>
private void OnSilenceDetected(int valid_frame)
{
    _lastest_confirmed_silence_frame = valid_frame;

    bool waitingForStart = _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected;
    if (!waitingForStart)
    {
        return;
    }

    PopDataBufTillFrame(valid_frame);
}
/// <summary>
/// Records <paramref name="valid_frame"/> as the latest confirmed speech frame
/// and appends that single frame to the current output segment.
/// </summary>
/// <param name="valid_frame">Index of the confirmed speech frame.</param>
private void OnVoiceDetected(int valid_frame)
{
    _latest_confirmed_speech_frame = valid_frame;
    PopDataToOutputBuf(
        valid_frame,
        frm_cnt: 1,
        first_frm_is_start_point: false,
        last_frm_is_end_point: false,
        end_point_is_sent_end: false);
}
/// <summary>
/// Confirms the start point of a segment and, for a real (non-fake) start,
/// opens a new output segment at that frame.
/// </summary>
/// <param name="start_frame">Frame index of the start point.</param>
/// <param name="fake_result">
/// True when the start is only a placeholder (the silence-timeout path passes
/// true); in that case nothing is written to the output buffer.
/// </param>
private void OnVoiceStart(int start_frame, bool fake_result = false)
{
    if (_confirmed_start_frame != -1)
    {
        Console.WriteLine("not reset vad properly\n");
    }
    else
    {
        _confirmed_start_frame = start_frame;
    }

    // A segment is opened only for a real start that arrives while no start
    // point has been detected. With "||" a fake start was also emitted, which
    // produced a bogus segment on silence timeout; OnVoiceEnd already skips
    // output for fake results, so the two are now consistent.
    if (!fake_result && _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
    {
        PopDataToOutputBuf(_confirmed_start_frame, 1, true, false, false);
    }
}
/// <summary>
/// Closes the current segment at <paramref name="end_frame"/>: frames between the
/// latest confirmed speech frame and the end frame are appended first, then the
/// end frame itself is emitted unless the result is fake.
/// </summary>
/// <param name="end_frame">Frame index of the end point.</param>
/// <param name="fake_result">True to skip writing the end frame to the output buffer.</param>
/// <param name="is_last_frame">True when the end point coincides with the end of the input.</param>
private void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame)
{
    // Fill the gap up to (but excluding) the end frame.
    int firstPending = _latest_confirmed_speech_frame + 1;
    for (int frame = firstPending; frame < end_frame; frame++)
    {
        OnVoiceDetected(frame);
    }

    if (_confirmed_end_frame == -1)
    {
        _confirmed_end_frame = end_frame;
    }
    else
    {
        Console.WriteLine("not reset vad properly\n");
    }

    if (!fake_result)
    {
        _sil_frame = 0;
        PopDataToOutputBuf(_confirmed_end_frame, 1, false, true, is_last_frame);
    }

    _number_end_time_detected += 1;
}
/// <summary>
/// When <paramref name="is_final_frame"/> is true, closes the current segment at
/// <paramref name="cur_frm_idx"/> and moves the state machine to "end point
/// detected"; otherwise does nothing.
/// </summary>
/// <param name="is_final_frame">Whether the frame is the last frame of the input.</param>
/// <param name="cur_frm_idx">Index of the current frame.</param>
private void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx)
{
    if (!is_final_frame)
    {
        return;
    }

    OnVoiceEnd(cur_frm_idx, false, true);
    _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
/// <summary>
/// Start-point latency expressed in milliseconds: the latency in frames
/// multiplied by the configured frame shift.
/// </summary>
/// <returns>Latency in milliseconds.</returns>
private int GetLatency()
{
    int latencyFrames = LatencyFrmNumAtStartPoint();
    return (int)(latencyFrames * _vad_opts.frame_in_ms);
}
/// <summary>
/// Number of frames by which a detected start point lags behind the real one:
/// the window detector's window size, plus the configured start-point look-back
/// (converted to frames) when <c>do_extend</c> is non-zero.
/// </summary>
/// <returns>Latency in frames.</returns>
private int LatencyFrmNumAtStartPoint()
{
    int windowFrames = _windows_detector.GetWinSize();
    if (_vad_opts.do_extend == 0)
    {
        return windowFrames;
    }

    return windowFrames + (int)(_vad_opts.lookback_time_start_point / _vad_opts.frame_in_ms);
}
/// <summary>
/// Classifies frame <paramref name="t"/> as speech or silence from its energy
/// and from the silence scores of the current chunk, and updates the running
/// noise-floor estimate when the scores favour noise.
/// </summary>
/// <param name="t">Absolute frame index (index into <c>_decibel</c>).</param>
/// <returns>
/// <c>kFrameStateSpeech</c> or <c>kFrameStateSil</c>.
/// </returns>
/// <remarks>
/// NOTE(review): for frames below the decibel threshold this method already
/// calls <see cref="DetectOneFrame"/>; DetectCommonFrames/DetectLastFrames then
/// call it again with the returned state, so such frames are fed to the
/// detector twice. Kept as-is to preserve behaviour; confirm whether intended.
/// </remarks>
private FrameState GetFrameState(int t)
{
    double frameDb = _decibel[t];
    double snr = frameDb - _noise_average_decibel;

    // Very quiet frames are silence regardless of the model scores.
    if (frameDb < _vad_opts.decibel_thres)
    {
        DetectOneFrame(FrameState.kFrameStateSil, t, false);
        return FrameState.kFrameStateSil;
    }

    double speechScore = 0.0;
    double noise_prob = 0.0;
    Trace.Assert(_sil_pdf_ids.Length == _vad_opts.silence_pdf_num, "");
    if (_sil_pdf_ids.Length > 0)
    {
        // Only batch_size = 1 is supported.
        Trace.Assert(_scores.GetLength(0) == 1, "只支持batch_size = 1的测试");

        // Row of this frame inside the current chunk's score tensor.
        int row = t - _idx_pre_chunk;
        float silenceScore = _sil_pdf_ids.Select(pdfId => _scores[0, row, pdfId]).Sum();

        noise_prob = Math.Log(silenceScore) * _vad_opts.speech_2_noise_ratio;
        speechScore = 1.0 - silenceScore;
    }
    double speech_prob = Math.Log(speechScore);

    if (_vad_opts.output_frame_probs)
    {
        _frame_probs.Add(new E2EVadFrameProbEntity
        {
            noise_prob = noise_prob,
            speech_prob = speech_prob,
            score = speechScore,
            frame_id = t,
        });
    }

    bool scoresFavourSpeech = Math.Exp(speech_prob) >= Math.Exp(noise_prob) + _speech_noise_thres;
    if (!scoresFavourSpeech)
    {
        // Noise frame: seed the noise floor on first use, otherwise fold the
        // frame into the running average.
        if (_noise_average_decibel < -99.9)
        {
            _noise_average_decibel = frameDb;
        }
        else
        {
            _noise_average_decibel = (frameDb + _noise_average_decibel * (_vad_opts.noise_frame_num_used_for_snr - 1)) / _vad_opts.noise_frame_num_used_for_snr;
        }
        return FrameState.kFrameStateSil;
    }

    // The scores favour speech; additionally require sufficient SNR and level.
    bool loudEnough = snr >= _vad_opts.snr_thres && frameDb >= _vad_opts.decibel_thres;
    return loudEnough ? FrameState.kFrameStateSpeech : FrameState.kFrameStateSil;
}
/// <summary>
/// Processes one chunk of scores and audio and returns the speech segments that
/// became reportable.
/// </summary>
/// <param name="score">Score tensor of the chunk; its first dimension is the batch size.</param>
/// <param name="waveform">Audio samples of the chunk.</param>
/// <param name="is_final">True for the last chunk; the model is fully reset afterwards.</param>
/// <param name="max_end_sil">Maximum trailing silence in milliseconds.</param>
/// <param name="online">
/// True to report segments incrementally; a start or end that is not known yet
/// (or was already reported) is given as -1.
/// </param>
/// <returns>
/// One entry per batch item. An entry is null when no segment was reported for
/// that item; otherwise its <c>Segment</c> list holds <c>[start_ms, end_ms]</c> pairs.
/// </returns>
public SegmentEntity[] DefaultCall(float[,,] score, float[] waveform,
    bool is_final = false, int max_end_sil = 800, bool online = false
    )
{
    _max_end_sil_frame_cnt_thresh = max_end_sil - _vad_opts.speech_to_sil_time_thres;

    // Per-frame energy first, then register the scores, then run detection.
    ComputeDecibel(waveform);
    ComputeScores(score);
    if (is_final)
    {
        DetectLastFrames();
    }
    else
    {
        DetectCommonFrames();
    }

    int batchSize = score.GetLength(0);
    SegmentEntity[] segments = new SegmentEntity[batchSize];

    // Only batch_size = 1 is supported for now.
    for (int batch = 0; batch < batchSize; batch++)
    {
        List<int[]> reported = new List<int[]>();

        for (int i = _output_data_buf_offset; i < _output_data_buf.Count; i++)
        {
            E2EVadSpeechBufWithDoaEntity seg = _output_data_buf[i];
            int start_ms;
            int end_ms;

            if (!online)
            {
                // Offline: before the final chunk only complete segments are reported.
                if (!is_final && !(seg.contain_seg_start_point && seg.contain_seg_end_point))
                {
                    continue;
                }
                start_ms = seg.start_ms;
                end_ms = seg.end_ms;
                _output_data_buf_offset += 1;
            }
            else
            {
                if (!seg.contain_seg_start_point)
                {
                    continue;
                }
                // The start was already reported and the end is still unknown.
                if (!_next_seg && !seg.contain_seg_end_point)
                {
                    continue;
                }

                start_ms = _next_seg ? seg.start_ms : -1;
                if (seg.contain_seg_end_point)
                {
                    end_ms = seg.end_ms;
                    _next_seg = true;
                    _output_data_buf_offset += 1;
                }
                else
                {
                    end_ms = -1;
                    _next_seg = false;
                }
            }

            reported.Add(new int[] { start_ms, end_ms });
        }

        if (reported.Count > 0)
        {
            if (segments[batch] == null)
            {
                segments[batch] = new SegmentEntity();
            }
            segments[batch].Segment.AddRange(reported);
        }
    }

    if (is_final)
    {
        // Reset all state so the instance is ready for the next query.
        AllResetDetection();
    }
    return segments;
}
/// <summary>
/// Runs detection over every frame of the current (non-final) chunk in
/// chronological order and then advances the chunk offset used to index the
/// score tensor. Does nothing once an end point has been detected.
/// </summary>
/// <returns>Always 0.</returns>
private int DetectCommonFrames()
{
    if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
    {
        return 0;
    }

    // The chunk covers the last nn_eval_block_size frames counted so far.
    int firstFrame = _frm_cnt - _vad_opts.nn_eval_block_size;
    for (int frame = firstFrame; frame < _frm_cnt; frame++)
    {
        FrameState state = GetFrameState(frame);
        DetectOneFrame(state, frame, false);
    }

    _idx_pre_chunk += _scores.GetLength(1) * _scores.GetLength(0);
    return 0;
}
/// <summary>
/// Runs detection over every frame of the final chunk in chronological order,
/// flagging the very last frame as the final frame of the input. Does nothing
/// once an end point has been detected.
/// </summary>
/// <returns>Always 0.</returns>
private int DetectLastFrames()
{
    if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
    {
        return 0;
    }

    int lastFrame = _frm_cnt - 1;
    int firstFrame = _frm_cnt - _vad_opts.nn_eval_block_size;
    for (int frame = firstFrame; frame <= lastFrame; frame++)
    {
        FrameState state = GetFrameState(frame);
        DetectOneFrame(state, frame, frame == lastFrame);
    }

    return 0;
}
/// <summary>
/// Feeds one classified frame to the window detector and advances the
/// end-point state machine according to the transition the detector reports.
/// </summary>
/// <param name="cur_frm_state">Speech/silence decision for the frame.</param>
/// <param name="cur_frm_idx">Absolute index of the frame.</param>
/// <param name="is_final_frame">True when this is the last frame of the input.</param>
/// <remarks>
/// When a known transition arrives while the state machine is not in a state
/// that handles it, the method returns immediately and the multi-utterance
/// reset at the end is skipped.
/// </remarks>
private void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame)
{
    // Map the incoming decision to the state handed to the window detector.
    FrameState gatedState = FrameState.kFrameStateInvalid;
    if (cur_frm_state == FrameState.kFrameStateSpeech)
    {
        gatedState = Math.Abs(1.0) > _vad_opts.fe_prior_thres
            ? FrameState.kFrameStateSpeech
            : FrameState.kFrameStateSil;
    }
    else if (cur_frm_state == FrameState.kFrameStateSil)
    {
        gatedState = FrameState.kFrameStateSil;
    }

    AudioChangeState transition = _windows_detector.DetectOneFrame(gatedState, cur_frm_idx);
    int shiftMs = _vad_opts.frame_in_ms;

    // State on entry; the branches below dispatch on it.
    bool waitingForStart = _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected;
    bool inSegment = _vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment;

    if (transition == AudioChangeState.kChangeStateSil2Speech)
    {
        _continous_silence_frame_count = 0;
        _pre_end_silence_detected = false;

        if (waitingForStart)
        {
            // Back-date the start by the detector latency, but never before
            // the start of the buffered data.
            int startFrame = Math.Max(_data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
            OnVoiceStart(startFrame);
            _vad_state_machine = (int)VadStateMachine.kVadInStateInSpeechSegment;
            for (int frame = startFrame + 1; frame <= cur_frm_idx; frame++)
            {
                OnVoiceDetected(frame);
            }
        }
        else if (inSegment)
        {
            // Catch up on frames skipped since the last confirmed speech frame.
            for (int frame = _latest_confirmed_speech_frame + 1; frame < cur_frm_idx; frame++)
            {
                OnVoiceDetected(frame);
            }
            ExtendOrCloseSegment(cur_frm_idx, is_final_frame, shiftMs, false);
        }
        else
        {
            return;
        }
    }
    else if (transition == AudioChangeState.kChangeStateSpeech2Sil)
    {
        _continous_silence_frame_count = 0;

        if (!inSegment)
        {
            return;
        }
        ExtendOrCloseSegment(cur_frm_idx, is_final_frame, shiftMs, false);
    }
    else if (transition == AudioChangeState.kChangeStateSpeech2Speech)
    {
        _continous_silence_frame_count = 0;

        if (!inSegment)
        {
            return;
        }
        ExtendOrCloseSegment(cur_frm_idx, is_final_frame, shiftMs, true);
    }
    else if (transition == AudioChangeState.kChangeStateSil2Sil)
    {
        _continous_silence_frame_count += 1;

        if (waitingForStart)
        {
            bool startSilenceTimedOut =
                _vad_opts.detect_mode == (int)VadDetectMode.kVadSingleUtteranceDetectMode
                && _continous_silence_frame_count * shiftMs > _vad_opts.max_start_silence_time;

            if (startSilenceTimedOut || (is_final_frame && _number_end_time_detected == 0))
            {
                // Silence timeout: report a zero-length (fake) decision.
                for (int frame = _lastest_confirmed_silence_frame + 1; frame < cur_frm_idx; frame++)
                {
                    OnSilenceDetected(frame);
                }
                OnVoiceStart(0, true);
                OnVoiceEnd(0, true, false);
                _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
            }
            else if (cur_frm_idx >= LatencyFrmNumAtStartPoint())
            {
                OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
            }
        }
        else if (inSegment)
        {
            if (_continous_silence_frame_count * shiftMs >= _max_end_sil_frame_cnt_thresh)
            {
                // Enough trailing silence: place the end point back where the
                // silence began, optionally keeping the look-ahead.
                int lookbackFrames = (int)(_max_end_sil_frame_cnt_thresh / shiftMs);
                if (_vad_opts.do_extend != 0)
                {
                    lookbackFrames -= (int)(_vad_opts.lookahead_time_end_point / shiftMs);
                    lookbackFrames -= 1;
                    lookbackFrames = Math.Max(0, lookbackFrames);
                }
                OnVoiceEnd(cur_frm_idx - lookbackFrames, false, false);
                _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
            }
            else if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / shiftMs)
            {
                OnVoiceEnd(cur_frm_idx, false, false);
                _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
            }
            else if (_vad_opts.do_extend != 0 && !is_final_frame)
            {
                // Keep silence frames that fall inside the end-point look-ahead.
                if (_continous_silence_frame_count <= (int)(_vad_opts.lookahead_time_end_point / shiftMs))
                {
                    OnVoiceDetected(cur_frm_idx);
                }
            }
            else
            {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }
        else
        {
            return;
        }
    }

    // In multi-utterance mode, get ready for the next segment right away.
    if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected
        && _vad_opts.detect_mode == (int)VadDetectMode.kVadMutipleUtteranceDetectMode)
    {
        ResetDetection();
    }
}

/// <summary>
/// Handles the current frame of an open segment: closes the segment when it
/// exceeds the maximum single-segment length, otherwise appends the frame, or
/// closes the segment when the frame is the last one of the input.
/// </summary>
/// <param name="frameIdx">Absolute index of the current frame.</param>
/// <param name="isFinalFrame">True when this is the last frame of the input.</param>
/// <param name="shiftMs">Frame shift in milliseconds.</param>
/// <param name="flagTimeout">True to set <c>_max_time_out</c> when the length limit is hit.</param>
private void ExtendOrCloseSegment(int frameIdx, bool isFinalFrame, int shiftMs, bool flagTimeout)
{
    if (frameIdx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / shiftMs)
    {
        if (flagTimeout)
        {
            _max_time_out = true;
        }
        OnVoiceEnd(frameIdx, false, false);
        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
    }
    else if (!isFinalFrame)
    {
        OnVoiceDetected(frameIdx);
    }
    else
    {
        MaybeOnVoiceEndIfLastFrame(isFinalFrame, frameIdx);
    }
}
- }
- }
|