|
|
@@ -132,40 +132,54 @@ class AudioWindow {
|
|
|
};
|
|
|
};
|
|
|
|
|
|
-AudioFrame::AudioFrame(){};
|
|
|
+AudioFrame::AudioFrame(){}
|
|
|
AudioFrame::AudioFrame(int len) : len(len)
|
|
|
{
|
|
|
start = 0;
|
|
|
-};
|
|
|
-AudioFrame::~AudioFrame(){};
|
|
|
+}
|
|
|
+AudioFrame::AudioFrame(const AudioFrame &other)
|
|
|
+{
|
|
|
+ start = other.start;
|
|
|
+ end = other.end;
|
|
|
+ len = other.len;
|
|
|
+ is_final = other.is_final;
|
|
|
+}
|
|
|
+AudioFrame::AudioFrame(int start, int end, bool is_final):start(start),end(end),is_final(is_final){
|
|
|
+ len = end - start;
|
|
|
+}
|
|
|
+AudioFrame::~AudioFrame(){
|
|
|
+ if(data != NULL){
|
|
|
+ free(data);
|
|
|
+ }
|
|
|
+}
|
|
|
int AudioFrame::SetStart(int val)
|
|
|
{
|
|
|
start = val < 0 ? 0 : val;
|
|
|
return start;
|
|
|
-};
|
|
|
+}
|
|
|
|
|
|
int AudioFrame::SetEnd(int val)
|
|
|
{
|
|
|
end = val;
|
|
|
len = end - start;
|
|
|
return end;
|
|
|
-};
|
|
|
+}
|
|
|
|
|
|
int AudioFrame::GetStart()
|
|
|
{
|
|
|
return start;
|
|
|
-};
|
|
|
+}
|
|
|
|
|
|
int AudioFrame::GetLen()
|
|
|
{
|
|
|
return len;
|
|
|
-};
|
|
|
+}
|
|
|
|
|
|
int AudioFrame::Disp()
|
|
|
{
|
|
|
LOG(ERROR) << "Not imp!!!!";
|
|
|
return 0;
|
|
|
-};
|
|
|
+}
|
|
|
|
|
|
Audio::Audio(int data_type) : data_type(data_type)
|
|
|
{
|
|
|
@@ -771,6 +785,55 @@ bool Audio::LoadPcmwav(const char* buf, int n_buf_len, int32_t* sampling_rate)
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate)
|
|
|
+{
|
|
|
+ if (speech_data != NULL) {
|
|
|
+ free(speech_data);
|
|
|
+ }
|
|
|
+ if (speech_buff != NULL) {
|
|
|
+ free(speech_buff);
|
|
|
+ }
|
|
|
+ if (speech_char != NULL) {
|
|
|
+ free(speech_char);
|
|
|
+ }
|
|
|
+
|
|
|
+ speech_len = n_buf_len / 2;
|
|
|
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
|
|
|
+ if (speech_buff)
|
|
|
+ {
|
|
|
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
|
|
|
+ memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
|
|
|
+
|
|
|
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
|
|
|
+ memset(speech_data, 0, sizeof(float) * speech_len);
|
|
|
+
|
|
|
+ float scale = 1;
|
|
|
+ if (data_type == 1) {
|
|
|
+ scale = 32768;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int32_t i = 0; i != speech_len; ++i) {
|
|
|
+ speech_data[i] = (float)speech_buff[i] / scale;
|
|
|
+ }
|
|
|
+
|
|
|
+ //resample
|
|
|
+ if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
+ WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int32_t i = 0; i != speech_len; ++i) {
|
|
|
+ all_samples.emplace_back(speech_data[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+ AudioFrame* frame = new AudioFrame(speech_len);
|
|
|
+ frame_queue.push(frame);
|
|
|
+ return true;
|
|
|
+
|
|
|
+ }
|
|
|
+ else
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
|
|
|
{
|
|
|
if (speech_data != NULL) {
|
|
|
@@ -879,24 +942,25 @@ bool Audio::LoadOthers2Char(const char* filename)
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-int Audio::FetchChunck(float *&dout, int len)
|
|
|
+int Audio::FetchTpass(AudioFrame *&frame)
|
|
|
{
|
|
|
- if (offset >= speech_align_len) {
|
|
|
- dout = NULL;
|
|
|
- return S_ERR;
|
|
|
- } else if (offset == speech_align_len - len) {
|
|
|
- dout = speech_data + offset;
|
|
|
- offset = speech_align_len;
|
|
|
- // 临时解决
|
|
|
- AudioFrame *frame = frame_queue.front();
|
|
|
- frame_queue.pop();
|
|
|
- delete frame;
|
|
|
+ if (asr_offline_queue.size() > 0) {
|
|
|
+ frame = asr_offline_queue.front();
|
|
|
+ asr_offline_queue.pop();
|
|
|
+ return 1;
|
|
|
+ } else {
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
- return S_END;
|
|
|
+int Audio::FetchChunck(AudioFrame *&frame)
|
|
|
+{
|
|
|
+ if (asr_online_queue.size() > 0) {
|
|
|
+ frame = asr_online_queue.front();
|
|
|
+ asr_online_queue.pop();
|
|
|
+ return 1;
|
|
|
} else {
|
|
|
- dout = speech_data + offset;
|
|
|
- offset += len;
|
|
|
- return S_MIDDLE;
|
|
|
+ return 0;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -965,7 +1029,6 @@ void Audio::Split(OfflineStream* offline_stream)
|
|
|
|
|
|
std::vector<float> pcm_data(speech_data, speech_data+sp_len);
|
|
|
vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
|
|
|
- int seg_sample = MODEL_SAMPLE_RATE/1000;
|
|
|
for(vector<int> segment:vad_segments)
|
|
|
{
|
|
|
frame = new AudioFrame();
|
|
|
@@ -978,7 +1041,6 @@ void Audio::Split(OfflineStream* offline_stream)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-
|
|
|
void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
|
|
|
{
|
|
|
AudioFrame *frame;
|
|
|
@@ -993,4 +1055,171 @@ void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, boo
|
|
|
vad_segments = vad_obj->Infer(pcm_data, input_finished);
|
|
|
}
|
|
|
|
|
|
+// 2pass
|
|
|
+void Audio::Split(VadModel* vad_obj, bool input_finished)
|
|
|
+{
|
|
|
+ AudioFrame *frame;
|
|
|
+
|
|
|
+ frame = frame_queue.front();
|
|
|
+ frame_queue.pop();
|
|
|
+ int sp_len = frame->GetLen();
|
|
|
+ delete frame;
|
|
|
+ frame = NULL;
|
|
|
+
|
|
|
+ std::vector<float> pcm_data(speech_data, speech_data+sp_len);
|
|
|
+ vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished);
|
|
|
+
|
|
|
+ // print vad_segments
|
|
|
+ // string seg_out="[";
|
|
|
+ // for (int i = 0; i < vad_segments.size(); i++) {
|
|
|
+ // vector<int> inner_vec = vad_segments[i];
|
|
|
+ // if(inner_vec.size() == 0){
|
|
|
+ // continue;
|
|
|
+ // }
|
|
|
+ // seg_out += "[";
|
|
|
+ // for (int j = 0; j < inner_vec.size(); j++) {
|
|
|
+ // seg_out += to_string(inner_vec[j]);
|
|
|
+ // if (j != inner_vec.size() - 1) {
|
|
|
+ // seg_out += ",";
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // seg_out += "]";
|
|
|
+ // if (i != vad_segments.size() - 1) {
|
|
|
+ // seg_out += ",";
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // seg_out += "]";
|
|
|
+ // LOG(INFO)<<seg_out;
|
|
|
+
|
|
|
+ speech_end += sp_len/seg_sample;
|
|
|
+ if(vad_segments.size() == 0){
|
|
|
+ if(speech_start != -1){
|
|
|
+ int start = speech_start*seg_sample;
|
|
|
+ int end = speech_end*seg_sample;
|
|
|
+ int buff_len = end-start;
|
|
|
+ int step = ONLINE_STEP;
|
|
|
+
|
|
|
+ if(buff_len >= step){
|
|
|
+ frame = new AudioFrame(step);
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * step);
|
|
|
+ memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
|
|
|
+ asr_online_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+ speech_start += step/seg_sample;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }else{
|
|
|
+ for(auto vad_segment: vad_segments){
|
|
|
+ int speech_start_i=-1, speech_end_i=-1;
|
|
|
+ if(vad_segment[0] != -1){
|
|
|
+ speech_start_i = vad_segment[0];
|
|
|
+ }
|
|
|
+ if(vad_segment[1] != -1){
|
|
|
+ speech_end_i = vad_segment[1];
|
|
|
+ }
|
|
|
+
|
|
|
+ // [1, 100]
|
|
|
+ if(speech_start_i != -1 && speech_end_i != -1){
|
|
|
+ int start = speech_start_i*seg_sample;
|
|
|
+ int end = speech_end_i*seg_sample;
|
|
|
+
|
|
|
+ frame = new AudioFrame(end-start);
|
|
|
+ frame->is_final = true;
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * (end-start));
|
|
|
+ memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
|
|
|
+ asr_online_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+
|
|
|
+ frame = new AudioFrame(end-start);
|
|
|
+ frame->is_final = true;
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * (end-start));
|
|
|
+ memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
|
|
|
+ asr_offline_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+
|
|
|
+ speech_start = -1;
|
|
|
+ speech_offline_start = -1;
|
|
|
+ // [70, -1]
|
|
|
+ }else if(speech_start_i != -1){
|
|
|
+ speech_start = speech_start_i;
|
|
|
+ speech_offline_start = speech_start_i;
|
|
|
+
|
|
|
+ int start = speech_start*seg_sample;
|
|
|
+ int end = speech_end*seg_sample;
|
|
|
+ int buff_len = end-start;
|
|
|
+ int step = ONLINE_STEP;
|
|
|
+
|
|
|
+ if(buff_len >= step){
|
|
|
+ frame = new AudioFrame(step);
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * step);
|
|
|
+ memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
|
|
|
+ asr_online_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+ speech_start += step/seg_sample;
|
|
|
+ }
|
|
|
+
|
|
|
+ }else if(speech_end_i != -1){ // [-1,100]
|
|
|
+ if(speech_start == -1 or speech_offline_start == -1){
|
|
|
+ LOG(ERROR) <<"Vad start is null while vad end is available." ;
|
|
|
+ exit(-1);
|
|
|
+ }
|
|
|
+
|
|
|
+ int start = speech_start*seg_sample;
|
|
|
+ int offline_start = speech_offline_start*seg_sample;
|
|
|
+ int end = speech_end_i*seg_sample;
|
|
|
+ int buff_len = end-start;
|
|
|
+ int step = ONLINE_STEP;
|
|
|
+
|
|
|
+ frame = new AudioFrame(end-offline_start);
|
|
|
+ frame->is_final = true;
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
|
|
|
+ memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
|
|
|
+ asr_offline_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+
|
|
|
+ if(buff_len > 0){
|
|
|
+ for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
|
|
|
+ bool is_final = false;
|
|
|
+ if (sample_offset + step >= buff_len - 1) {
|
|
|
+ step = buff_len - sample_offset;
|
|
|
+ is_final = true;
|
|
|
+ }
|
|
|
+ frame = new AudioFrame(step);
|
|
|
+ frame->is_final = is_final;
|
|
|
+ frame->data = (float*)malloc(sizeof(float) * step);
|
|
|
+ memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
|
|
|
+ asr_online_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+ }
|
|
|
+ }else{
|
|
|
+ frame = new AudioFrame(0);
|
|
|
+ frame->is_final = true;
|
|
|
+ asr_online_queue.push(frame);
|
|
|
+ frame = NULL;
|
|
|
+ }
|
|
|
+ speech_start = -1;
|
|
|
+ speech_offline_start = -1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // erase all_samples
|
|
|
+ int vector_cache = MODEL_SAMPLE_RATE*2;
|
|
|
+ if(speech_offline_start == -1){
|
|
|
+ if(all_samples.size() > vector_cache){
|
|
|
+ int erase_num = all_samples.size() - vector_cache;
|
|
|
+ all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
|
|
|
+ offset += erase_num;
|
|
|
+ }
|
|
|
+ }else{
|
|
|
+ int offline_start = speech_offline_start*seg_sample;
|
|
|
+ if(offline_start-offset > vector_cache){
|
|
|
+ int erase_num = offline_start-offset - vector_cache;
|
|
|
+ all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
|
|
|
+ offset += erase_num;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
} // namespace funasr
|