paraformer_bin.py

# -*- encoding: utf-8 -*-
import copy
import os.path
from pathlib import Path
from typing import List, Tuple, Union

import librosa
import numpy as np
import torch

from .utils.frontend import WavFrontend
from .utils.postprocess_utils import sentence_postprocess
from .utils.timestamp_utils import time_stamp_lfr6_onnx
from .utils.utils import (CharTokenizer, Hypothesis, TokenIDConverter,
                          get_logger, read_yaml)

logging = get_logger()
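

# Offline Paraformer ASR on a TorchScript export: load audio, extract
# fbank/LFR/CMVN features, run a batched forward pass, then greedily decode
# tokens (with CIF-peak timestamps when the model returns them).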
class Paraformer:
    """Paraformer non-autoregressive ASR, loaded from a TorchScript export."""

    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 quantize: bool = False,
                 intra_op_num_threads: int = 1,  # unused by this TorchScript backend
                 ):
        if model_dir is None or not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.torchscripts')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.torchscripts')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)

        self.converter = TokenIDConverter(config['token_list'])
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = torch.jit.load(model_file)
        self.batch_size = batch_size
        self.device_id = device_id
        self.plot_timestamp_to = plot_timestamp_to
        # predictor_bias trims the estimated token count in decode_one.
        self.pred_bias = config['model_conf'].get('predictor_bias', 0)
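
    # Expected contents of model_dir (the four files referenced in __init__):
    #   model.torchscripts        TorchScript model (model_quant.torchscripts when quantize=True)
    #   config.yaml               token_list, frontend_conf, model_conf
    #   am.mvn                    CMVN file consumed by WavFrontend
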
    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                with torch.no_grad():
                    if int(self.device_id) == -1:
                        outputs = self.ort_infer(feats, feats_len)
                        am_scores, valid_token_lens = outputs[0], outputs[1]
                    else:
                        outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
                    if len(outputs) == 4:
                        # BiCifParaformer additionally returns upsampled CIF
                        # alphas/peaks, used below for token-level timestamps.
                        us_alphas, us_peaks = outputs[2], outputs[3]
                    else:
                        us_alphas, us_peaks = None, None
            except Exception:
                logging.warning("input wav is silence or noise")
                preds = ['']
            else:
                preds = self.decode(am_scores, valid_token_lens)
                if us_peaks is None:
                    for pred in preds:
                        pred = sentence_postprocess(pred)
                        asr_res.append({'preds': pred})
                else:
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, 'raw_tokens': raw_tokens})
        return asr_res
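
    # Each result dict carries 'preds' (post-processed text); when the model
    # also returns CIF peaks, 'timestamp' and 'raw_tokens' are added as well.
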
    def plot_wave_timestamp(self, wav, text_timestamp, dest):
        import matplotlib
        matplotlib.use('Agg')
        # Set a font your system provides; 'Alibaba PuHuiTi' covers CJK glyphs.
        matplotlib.rc("font", family='Alibaba PuHuiTi')
        import matplotlib.pyplot as plt

        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
        ax2 = ax1.twinx()
        ax2.set_ylim([0, 2.0])
        # plot the normalized waveform; the time axis assumes 16 kHz audio
        ax1.set_ylim([-0.3, 0.3])
        time = np.arange(wav.shape[0]) / 16000
        ax1.plot(time, wav / wav.max() * 0.3, color='gray', alpha=0.4)
        # mark each token's start/end and write the token between the lines
        for (char, start, end) in text_timestamp:
            ax1.vlines(start, -0.3, 0.3, ls='--')
            ax1.vlines(end, -0.3, 0.3, ls='--')
            x_adj = 0.045 if char != '<sil>' else 0.12
            ax1.text((start + end) * 0.5 - x_adj, 0, char)
        plotname = "{}/timestamp.png".format(dest)
        plt.savefig(plotname, bbox_inches='tight')

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')
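
    # Note: librosa.load resamples files to fs (the frontend sample rate at the
    # call site in __call__); ndarray inputs are passed through unchanged.
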
    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[torch.Tensor, torch.Tensor]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        feats = torch.from_numpy(feats).type(torch.float32)
        feats_len = torch.from_numpy(feats_len).type(torch.int32)
        return feats, feats_len
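
    # Shapes: feats is (batch, max_frames, feat_dim) float32, zero-padded to the
    # longest item; feats_len holds each item's true frame count as int32.
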
    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: torch.Tensor,
              feats_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Call the TorchScript module the same way __call__ does (positional
        # tensors, not a single list argument).
        outputs = self.ort_infer(feats, feats_len)
        return outputs

    def decode(self, am_scores: torch.Tensor, token_nums: torch.Tensor) -> List[List[str]]:
        # decode_one operates on numpy arrays, so convert from torch first.
        am_scores = am_scores.numpy()
        token_nums = token_nums.numpy()
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)
        # pad with sos/eos ids to stay compatible with the Hypothesis format
        # (asr_model.sos: 1, asr_model.eos: 2)
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and keep the remaining ids
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove the blank symbol id (assumed to be 0) and any stray eos (2)
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # map integer ids back to tokens, then trim to the predictor's
        # estimated token count
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num - self.pred_bias]
        return token
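

# A minimal usage sketch, assuming a local export that follows the model_dir
# layout noted above (paths are placeholders, not shipped with this module):
if __name__ == '__main__':
    model = Paraformer('/path/to/paraformer_torchscript_dir', batch_size=1,
                       device_id='-1')  # '-1' keeps inference on the CPU
    for res in model('/path/to/example.wav'):
        print(res['preds'])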