audio.h 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. #ifndef AUDIO_H
  2. #define AUDIO_H
  3. #include <queue>
  4. #include <stdint.h>
  5. #include "vad-model.h"
  6. #include "offline-stream.h"
  7. #include "com-define.h"
  // Fallback definition of WAV_HEADER_SIZE. A value supplied earlier (by the
  // build system or a previously included header) takes precedence.
  // NOTE(review): 44 matches the canonical RIFF/WAVE PCM header length in
  // bytes - confirm against the loader implementation.
  #if !defined(WAV_HEADER_SIZE)
  #define WAV_HEADER_SIZE 44
  #endif
  // NOTE(review): a using-directive at header scope is inherited by every
  // translation unit that includes this header. Other files may depend on the
  // unqualified std names it exposes, so audit includers before removing it.
  11. using namespace std;
  12. namespace funasr {
  /**
   * Describes one segment of audio: a private start/end pair plus public
   * fields for an optional sample buffer and global timing.
   *
   * Only declarations are visible in this header; every member function is
   * defined elsewhere. Integer members carry default initializers so that a
   * constructor which does not set them still leaves the object in a
   * deterministic state.
   *
   * NOTE(review): a copy constructor and a destructor are declared, but no
   * copy-assignment operator. The compiler-generated assignment copies the
   * `data` pointer member-wise; if the destructor releases `data`, assigning
   * one frame to another would be unsafe - confirm against the implementation.
   */
  class AudioFrame {
  private:
    int start = 0;  // segment start position (unit defined by the implementation)
    int end = 0;    // segment end position (unit defined by the implementation)

  public:
    AudioFrame();
    AudioFrame(int len);
    AudioFrame(const AudioFrame &other);
    AudioFrame(int start, int end, bool is_final);
    ~AudioFrame();

    // Setters for the private start/end pair. The meaning of the returned int
    // is defined by the implementation, which is not visible here.
    int SetStart(int val);
    int SetEnd(int val);

    // Accessors. NOTE(review): GetLen() presumably derives its result from
    // start/end - confirm against the implementation.
    int GetStart();
    int GetLen();

    // NOTE(review): presumably a debug/display helper - confirm.
    int Disp();

    // 2pass
    bool is_final = false;  // set through the (start, end, is_final) constructor or directly
    float* data = nullptr;  // sample buffer; ownership is decided by the implementation
    int len = 0;            // NOTE(review): presumably the element count of `data` - confirm
    int global_start = 0;   // the start of a frame in the global time axis, in ms
    int global_end = 0;     // the end of a frame in the global time axis, in ms
  };
  // DLLAPI decorates classes that cross the shared-library boundary.
  //   Windows, building the library (_FUNASR_API_EXPORT defined): dllexport
  //   Windows, consuming the library:                            dllimport
  //   Any other platform:                                        expands to nothing
  #if defined(_WIN32) && defined(_FUNASR_API_EXPORT)
  #define DLLAPI __declspec(dllexport)
  #elif defined(_WIN32)
  #define DLLAPI __declspec(dllimport)
  #else
  #define DLLAPI
  #endif
  44. class DLLAPI Audio {
  45. private:
  46. float *speech_data=nullptr;
  47. int16_t *speech_buff=nullptr;
  48. char* speech_char=nullptr;
  49. int speech_len;
  50. int speech_align_len;
  51. float align_size;
  52. int data_type;
  53. queue<AudioFrame *> frame_queue;
  54. queue<AudioFrame *> asr_online_queue;
  55. queue<AudioFrame *> asr_offline_queue;
  56. int dest_sample_rate;
  57. public:
  58. Audio(int data_type);
  59. Audio(int model_sample_rate,int data_type);
  60. Audio(int model_sample_rate,int data_type, int size);
  61. ~Audio();
  62. void ClearQueue(std::queue<AudioFrame*>& q);
  63. void Disp();
  64. void WavResample(int32_t sampling_rate, const float *waveform, int32_t n);
  65. bool LoadWav(const char* buf, int n_len, int32_t* sampling_rate);
  66. bool LoadWav(const char* filename, int32_t* sampling_rate, bool resample=true);
  67. bool LoadWav2Char(const char* filename, int32_t* sampling_rate);
  68. bool LoadPcmwav(const char* buf, int n_file_len, int32_t* sampling_rate);
  69. bool LoadPcmwav(const char* filename, int32_t* sampling_rate, bool resample=true);
  70. bool LoadPcmwav2Char(const char* filename, int32_t* sampling_rate);
  71. bool LoadOthers2Char(const char* filename);
  72. bool FfmpegLoad(const char *filename, bool copy2char=false);
  73. bool FfmpegLoad(const char* buf, int n_file_len);
  74. int FetchChunck(AudioFrame *&frame);
  75. int FetchTpass(AudioFrame *&frame);
  76. int Fetch(float *&dout, int &len, int &flag);
  77. int Fetch(float *&dout, int &len, int &flag, float &start_time);
  78. void Padding();
  79. void Split(OfflineStream* offline_streamj);
  80. void CutSplit(OfflineStream* offline_streamj);
  81. void Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished=true);
  82. void Split(VadModel* vad_obj, int chunk_len, bool input_finished=true, ASR_TYPE asr_mode=ASR_TWO_PASS);
  83. float GetTimeLen();
  84. int GetQueueSize() { return (int)frame_queue.size(); }
  85. char* GetSpeechChar(){return speech_char;}
  86. int GetSpeechLen(){return speech_len;}
  87. // 2pass
  88. vector<float> all_samples;
  89. int offset = 0;
  90. int speech_start=-1, speech_end=0;
  91. int speech_offline_start=-1;
  92. int seg_sample = MODEL_SAMPLE_RATE/1000;
  93. bool LoadPcmwavOnline(const char* buf, int n_file_len, int32_t* sampling_rate);
  94. void ResetIndex(){
  95. speech_start=-1;
  96. speech_end=0;
  97. speech_offline_start=-1;
  98. offset = 0;
  99. all_samples.clear();
  100. }
  101. };
  102. } // namespace funasr
  103. #endif