|
@@ -193,18 +193,28 @@ int AudioFrame::Disp()
|
|
|
return 0;
|
|
return 0;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-Audio::Audio(int data_type) : data_type(data_type)
|
|
|
|
|
|
|
+Audio::Audio(int data_type) : dest_sample_rate(MODEL_SAMPLE_RATE), data_type(data_type)
|
|
|
{
|
|
{
|
|
|
speech_buff = NULL;
|
|
speech_buff = NULL;
|
|
|
speech_data = NULL;
|
|
speech_data = NULL;
|
|
|
align_size = 1360;
|
|
align_size = 1360;
|
|
|
|
|
+ seg_sample = dest_sample_rate / 1000;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-Audio::Audio(int data_type, int size) : data_type(data_type)
|
|
|
|
|
|
|
+Audio::Audio(int model_sample_rate, int data_type) : dest_sample_rate(model_sample_rate), data_type(data_type)
|
|
|
|
|
+{
|
|
|
|
|
+ speech_buff = NULL;
|
|
|
|
|
+ speech_data = NULL;
|
|
|
|
|
+ align_size = 1360;
|
|
|
|
|
+ seg_sample = dest_sample_rate / 1000;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+Audio::Audio(int model_sample_rate, int data_type, int size) : dest_sample_rate(model_sample_rate), data_type(data_type)
|
|
|
{
|
|
{
|
|
|
speech_buff = NULL;
|
|
speech_buff = NULL;
|
|
|
speech_data = NULL;
|
|
speech_data = NULL;
|
|
|
align_size = (float)size;
|
|
align_size = (float)size;
|
|
|
|
|
+ seg_sample = dest_sample_rate / 1000;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
Audio::~Audio()
|
|
Audio::~Audio()
|
|
@@ -222,12 +232,12 @@ Audio::~Audio()
|
|
|
|
|
|
|
|
void Audio::Disp()
|
|
void Audio::Disp()
|
|
|
{
|
|
{
|
|
|
- LOG(INFO) << "Audio time is " << (float)speech_len / MODEL_SAMPLE_RATE << " s. len is " << speech_len;
|
|
|
|
|
|
|
+ LOG(INFO) << "Audio time is " << (float)speech_len / dest_sample_rate << " s. len is " << speech_len;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
float Audio::GetTimeLen()
|
|
float Audio::GetTimeLen()
|
|
|
{
|
|
{
|
|
|
- return (float)speech_len / MODEL_SAMPLE_RATE;
|
|
|
|
|
|
|
+ return (float)speech_len / dest_sample_rate;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void Audio::WavResample(int32_t sampling_rate, const float *waveform,
|
|
void Audio::WavResample(int32_t sampling_rate, const float *waveform,
|
|
@@ -237,13 +247,13 @@ void Audio::WavResample(int32_t sampling_rate, const float *waveform,
|
|
|
<< " in_sample_rate: "<< sampling_rate << "\n"
|
|
<< " in_sample_rate: "<< sampling_rate << "\n"
|
|
|
<< " output_sample_rate: " << static_cast<int32_t>(MODEL_SAMPLE_RATE);
|
|
<< " output_sample_rate: " << static_cast<int32_t>(MODEL_SAMPLE_RATE);
|
|
|
float min_freq =
|
|
float min_freq =
|
|
|
- std::min<int32_t>(sampling_rate, MODEL_SAMPLE_RATE);
|
|
|
|
|
|
|
+ std::min<int32_t>(sampling_rate, dest_sample_rate);
|
|
|
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
|
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
|
|
|
|
|
|
|
int32_t lowpass_filter_width = 6;
|
|
int32_t lowpass_filter_width = 6;
|
|
|
|
|
|
|
|
auto resampler = std::make_unique<LinearResample>(
|
|
auto resampler = std::make_unique<LinearResample>(
|
|
|
- sampling_rate, MODEL_SAMPLE_RATE, lowpass_cutoff, lowpass_filter_width);
|
|
|
|
|
|
|
+ sampling_rate, dest_sample_rate, lowpass_cutoff, lowpass_filter_width);
|
|
|
std::vector<float> samples;
|
|
std::vector<float> samples;
|
|
|
resampler->Resample(waveform, n, true, &samples);
|
|
resampler->Resample(waveform, n, true, &samples);
|
|
|
//reset speech_data
|
|
//reset speech_data
|
|
@@ -311,7 +321,7 @@ bool Audio::FfmpegLoad(const char *filename, bool copy2char){
|
|
|
nullptr, // allocate a new context
|
|
nullptr, // allocate a new context
|
|
|
AV_CH_LAYOUT_MONO, // output channel layout (stereo)
|
|
AV_CH_LAYOUT_MONO, // output channel layout (stereo)
|
|
|
AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
|
|
AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
|
|
|
- 16000, // output sample rate (same as input)
|
|
|
|
|
|
|
+ dest_sample_rate, // output sample rate (same as input)
|
|
|
av_get_default_channel_layout(codecContext->channels), // input channel layout
|
|
av_get_default_channel_layout(codecContext->channels), // input channel layout
|
|
|
codecContext->sample_fmt, // input sample format
|
|
codecContext->sample_fmt, // input sample format
|
|
|
codecContext->sample_rate, // input sample rate
|
|
codecContext->sample_rate, // input sample rate
|
|
@@ -347,7 +357,7 @@ bool Audio::FfmpegLoad(const char *filename, bool copy2char){
|
|
|
int in_samples = frame->nb_samples;
|
|
int in_samples = frame->nb_samples;
|
|
|
uint8_t **in_data = frame->extended_data;
|
|
uint8_t **in_data = frame->extended_data;
|
|
|
int out_samples = av_rescale_rnd(in_samples,
|
|
int out_samples = av_rescale_rnd(in_samples,
|
|
|
- 16000,
|
|
|
|
|
|
|
+ dest_sample_rate,
|
|
|
codecContext->sample_rate,
|
|
codecContext->sample_rate,
|
|
|
AV_ROUND_DOWN);
|
|
AV_ROUND_DOWN);
|
|
|
|
|
|
|
@@ -494,7 +504,7 @@ bool Audio::FfmpegLoad(const char* buf, int n_file_len){
|
|
|
nullptr, // allocate a new context
|
|
nullptr, // allocate a new context
|
|
|
AV_CH_LAYOUT_MONO, // output channel layout (stereo)
|
|
AV_CH_LAYOUT_MONO, // output channel layout (stereo)
|
|
|
AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
|
|
AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
|
|
|
- 16000, // output sample rate (same as input)
|
|
|
|
|
|
|
+ dest_sample_rate, // output sample rate (same as input)
|
|
|
av_get_default_channel_layout(codecContext->channels), // input channel layout
|
|
av_get_default_channel_layout(codecContext->channels), // input channel layout
|
|
|
codecContext->sample_fmt, // input sample format
|
|
codecContext->sample_fmt, // input sample format
|
|
|
codecContext->sample_rate, // input sample rate
|
|
codecContext->sample_rate, // input sample rate
|
|
@@ -532,7 +542,7 @@ bool Audio::FfmpegLoad(const char* buf, int n_file_len){
|
|
|
int in_samples = frame->nb_samples;
|
|
int in_samples = frame->nb_samples;
|
|
|
uint8_t **in_data = frame->extended_data;
|
|
uint8_t **in_data = frame->extended_data;
|
|
|
int out_samples = av_rescale_rnd(in_samples,
|
|
int out_samples = av_rescale_rnd(in_samples,
|
|
|
- 16000,
|
|
|
|
|
|
|
+ dest_sample_rate,
|
|
|
codecContext->sample_rate,
|
|
codecContext->sample_rate,
|
|
|
AV_ROUND_DOWN);
|
|
AV_ROUND_DOWN);
|
|
|
|
|
|
|
@@ -666,7 +676,7 @@ bool Audio::LoadWav(const char *filename, int32_t* sampling_rate)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//resample
|
|
//resample
|
|
|
- if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
|
|
|
|
+ if(*sampling_rate != dest_sample_rate){
|
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -752,7 +762,7 @@ bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//resample
|
|
//resample
|
|
|
- if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
|
|
|
|
+ if(*sampling_rate != dest_sample_rate){
|
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -795,7 +805,7 @@ bool Audio::LoadPcmwav(const char* buf, int n_buf_len, int32_t* sampling_rate)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//resample
|
|
//resample
|
|
|
- if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
|
|
|
|
+ if(*sampling_rate != dest_sample_rate){
|
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -840,7 +850,7 @@ bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_r
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//resample
|
|
//resample
|
|
|
- if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
|
|
|
|
+ if(*sampling_rate != dest_sample_rate){
|
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -898,7 +908,7 @@ bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//resample
|
|
//resample
|
|
|
- if(*sampling_rate != MODEL_SAMPLE_RATE){
|
|
|
|
|
|
|
+ if(*sampling_rate != dest_sample_rate){
|
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
WavResample(*sampling_rate, speech_data, speech_len);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -1009,7 +1019,7 @@ int Audio::Fetch(float *&dout, int &len, int &flag, float &start_time)
|
|
|
AudioFrame *frame = frame_queue.front();
|
|
AudioFrame *frame = frame_queue.front();
|
|
|
frame_queue.pop();
|
|
frame_queue.pop();
|
|
|
|
|
|
|
|
- start_time = (float)(frame->GetStart())/MODEL_SAMPLE_RATE;
|
|
|
|
|
|
|
+ start_time = (float)(frame->GetStart())/ dest_sample_rate;
|
|
|
dout = speech_data + frame->GetStart();
|
|
dout = speech_data + frame->GetStart();
|
|
|
len = frame->GetLen();
|
|
len = frame->GetLen();
|
|
|
delete frame;
|
|
delete frame;
|
|
@@ -1248,7 +1258,7 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// erase all_samples
|
|
// erase all_samples
|
|
|
- int vector_cache = MODEL_SAMPLE_RATE*2;
|
|
|
|
|
|
|
+ int vector_cache = dest_sample_rate*2;
|
|
|
if(speech_offline_start == -1){
|
|
if(speech_offline_start == -1){
|
|
|
if(all_samples.size() > vector_cache){
|
|
if(all_samples.size() > vector_cache){
|
|
|
int erase_num = all_samples.size() - vector_cache;
|
|
int erase_num = all_samples.size() - vector_cache;
|